diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..4077f91 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,7 @@ +version: 2 +jobs: + build: + docker: + - image: circleci/python:3.7-node-browsers + steps: + - run: echo "hello world" diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..7330426 --- /dev/null +++ b/.clang-format @@ -0,0 +1,88 @@ +--- +AccessModifierOffset: -1 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] +IncludeCategories: + - Regex: '^<.*\.h(pp)?>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 2000000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..5466a4a --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,43 @@ +--- +# NOTE: there must be no spaces before the '-', so put the comma first. 
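+# The leading '*' in the list below enables every available check; the
+# '-'-prefixed globs then opt back out of individual checks or whole groups.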
+Checks: ' + * + ,modernize-* + ,-cert-err58-cpp + ,-cert-err60-cpp + ,-clang-diagnostic-* + ,-cppcoreguidelines-owning-memory + ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay + ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-static-cast-downcast + ,-cppcoreguidelines-pro-type-vararg + ,-cppcoreguidelines-special-member-functions + ,-fuchsia-* + ,-google-build-using-namespace + ,-google-explicit-constructor + ,-google-readability-braces-around-statements + ,-google-readability-namespace-comments + ,-google-readability-todo + ,-google-runtime-references + ,-google-runtime-references + ,-hicpp-braces-around-statements + ,-hicpp-explicit-conversions + ,-hicpp-no-array-decay + ,-hicpp-special-member-functions + ,-hicpp-vararg + ,-llvm-header-guard + ,-llvm-namespace-comment + ,-misc-unused-parameters + ,-modernize-make-unique + ,-modernize-use-default-member-init + ,-performance-unnecessary-value-param + ,-readability-braces-around-statements + ,-readability-else-after-return + ,-readability-named-parameter + ,clang-analyzer-* + ' +WarningsAsErrors: '' +HeaderFilterRegex: 'torch/csrc/' +AnalyzeTemporaryDtors: false +CheckOptions: +... diff --git a/.dockerignore b/.dockerignore new file mode 120000 index 0000000..3e4e48b --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.gitignore \ No newline at end of file diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..407cab0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,38 @@ +If you have a question or would like help and support, please ask at our +[forums](https://discuss.pytorch.org/). + +If you are submitting a feature request, please preface the title with [feature request]. +If you are submitting a bug report, please fill in the following details. + +## Issue description + +Provide a short description. + +## Code example + +Please try to provide a minimal example to repro the bug. +Error messages and stack traces are also helpful. + +## System Info +Please copy and paste the output from our +[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) +(or fill out the checklist below manually). + +You can get the script and run it with: +``` +wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py +# For security purposes, please check the contents of collect_env.py before running it. +python collect_env.py +``` + +- PyTorch or Caffe2: +- How you installed PyTorch (conda, pip, source): +- Build command you used (if compiling from source): +- OS: +- PyTorch version: +- Python version: +- CUDA/cuDNN version: +- GPU models and configuration: +- GCC version (if compiling from source): +- CMake version: +- Versions of any other relevant libraries: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09e5ed8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,196 @@ +# READ THIS BEFORE YOU REFACTOR ME +# +# setup.py uses the list of patterns in this file to decide +# what to delete, but it's not 100% sound. So, for example, +# if you delete aten/build/ because it's redundant with build/, +# aten/build/ will stop being cleaned. So be careful when +# refactoring this file! 
+ +## PyTorch + +.mypy_cache +*/*.pyc +*/*.so* +*/**/__pycache__ +*/**/*.dylib* +*/**/*.pyc +*/**/*.pyd +*/**/*.so* +*/**/**/*.pyc +*/**/**/**/*.pyc +*/**/**/**/**/*.pyc +aten/build/ +aten/src/ATen/Config.h +aten/src/ATen/cuda/CUDAConfig.h +build/ +dist/ +docs/src/**/* +test/.coverage +test/cpp/api/mnist +test/data/gpu_tensors.pt +test/data/legacy_modules.t7 +test/data/legacy_serialized.pt +test/data/linear.pt +test/htmlcov +third_party/build/ +tools/shared/_utils_internal.py +torch.egg-info/ +torch/csrc/autograd/generated/* +torch/csrc/cudnn/cuDNN.cpp +torch/csrc/generated +torch/csrc/generic/TensorMethods.cpp +torch/csrc/jit/generated/* +torch/csrc/nn/THCUNN.cpp +torch/csrc/nn/THCUNN.cwrap +torch/csrc/nn/THNN_generic.cpp +torch/csrc/nn/THNN_generic.cwrap +torch/csrc/nn/THNN_generic.h +torch/csrc/nn/THNN.cpp +torch/csrc/nn/THNN.cwrap +torch/lib/*.a* +torch/lib/*.dll* +torch/lib/*.dylib* +torch/lib/*.h +torch/lib/*.lib +torch/lib/*.so* +torch/lib/build +torch/lib/cmake +torch/lib/include +torch/lib/pkgconfig +torch/lib/protoc +torch/lib/tmp_install +torch/lib/torch_shm_manager +torch/version.py + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swn +*.swo +*.swp +*.swm +*~ + +# macOS dir files +.DS_Store + +# Symbolic files +tools/shared/cwrap_common.py + +# Ninja files +.ninja_deps +.ninja_log +compile_commands.json +*.egg-info/ +docs/source/scripts/activation_images/ + +## General + +# Compiled Object files +*.slo +*.lo +*.o +*.cuo +*.obj + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Compiled protocol buffers +*.pb.h +*.pb.cc +*_pb2.py + +# Compiled python +*.pyc +*.pyd + +# Compiled MATLAB +*.mex* + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swn +*.swo +*.swp +*~ + +# Sublime Text settings +*.sublime-workspace +*.sublime-project + +# Eclipse Project settings +*.*project +.settings + +# QtCreator files +*.user + +# PyCharm files +.idea + +# Visual Studio Code files +.vscode +.vs + +# OSX dir files +.DS_Store + +## Caffe2 + +# build, distribute, and bins (+ python proto bindings) +build +build_host_protoc +build_android +build_ios +/build_* +.build_debug/* +.build_release/* +distribute/* +*.testbin +*.bin +cmake_build +.cmake_build +gen +.setuptools-cmake-build +.pytest_cache +aten/build/* + +# Bram +plsdontbreak + +# Generated documentation +docs/_site +docs/gathered +_site +doxygen +docs/dev + +# LevelDB files +*.sst +*.ldb +LOCK +LOG* +CURRENT +MANIFEST-* + +# generated version file +caffe2/version.py + +# setup.py intermediates +.eggs +caffe2.egg-info + +# Atom/Watchman required file +.watchmanconfig diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..098255c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,81 @@ +[submodule "third_party/catch"] + path = third_party/catch + url = https://github.com/catchorg/Catch2.git +[submodule "third_party/nanopb"] + path = third_party/nanopb + url = https://github.com/nanopb/nanopb.git +[submodule "third_party/pybind11"] + path = third_party/pybind11 + url = https://github.com/pybind/pybind11.git +[submodule "third_party/cub"] + path = third_party/cub + url = https://github.com/NVlabs/cub.git +[submodule "third_party/eigen"] + path = third_party/eigen + url = https://github.com/eigenteam/eigen-git-mirror.git +[submodule "third_party/googletest"] + path = third_party/googletest + url = https://github.com/google/googletest.git +[submodule "third_party/nervanagpu"] + path = third_party/nervanagpu + 
url = https://github.com/NervanaSystems/nervanagpu.git +[submodule "third_party/benchmark"] + path = third_party/benchmark + url = https://github.com/google/benchmark.git +[submodule "third_party/protobuf"] + path = third_party/protobuf + url = https://github.com/google/protobuf.git +[submodule "third_party/ios-cmake"] + path = third_party/ios-cmake + url = https://github.com/Yangqing/ios-cmake.git +[submodule "third_party/NNPACK"] + path = third_party/NNPACK + url = https://github.com/Maratyszcza/NNPACK.git +[submodule "third_party/gloo"] + path = third_party/gloo + url = https://github.com/facebookincubator/gloo +[submodule "third_party/NNPACK_deps/pthreadpool"] + path = third_party/pthreadpool + url = https://github.com/Maratyszcza/pthreadpool.git +[submodule "third_party/NNPACK_deps/FXdiv"] + path = third_party/FXdiv + url = https://github.com/Maratyszcza/FXdiv.git +[submodule "third_party/NNPACK_deps/FP16"] + path = third_party/FP16 + url = https://github.com/Maratyszcza/FP16.git +[submodule "third_party/NNPACK_deps/psimd"] + path = third_party/psimd + url = https://github.com/Maratyszcza/psimd.git +[submodule "third_party/zstd"] + path = third_party/zstd + url = https://github.com/facebook/zstd.git +[submodule "third-party/cpuinfo"] + path = third_party/cpuinfo + url = https://github.com/Maratyszcza/cpuinfo.git +[submodule "third_party/python-enum"] + path = third_party/python-enum + url = https://github.com/PeachPy/enum34.git +[submodule "third_party/python-peachpy"] + path = third_party/python-peachpy + url = https://github.com/Maratyszcza/PeachPy.git +[submodule "third_party/python-six"] + path = third_party/python-six + url = https://github.com/benjaminp/six.git +[submodule "third_party/ComputeLibrary"] + path = third_party/ComputeLibrary + url = https://github.com/ARM-software/ComputeLibrary.git +[submodule "third_party/onnx"] + path = third_party/onnx + url = https://github.com/onnx/onnx.git +[submodule "third_party/cereal"] + path = third_party/cereal + url = https://github.com/USCiLab/cereal +[submodule "third_party/onnx-tensorrt"] + path = third_party/onnx-tensorrt + url = https://github.com/onnx/onnx-tensorrt +[submodule "third_party/sleef"] + path = third_party/sleef + url = https://github.com/shibatch/sleef +[submodule "third_party/ideep"] + path = third_party/ideep + url = https://github.com/intel/ideep diff --git a/.jenkins/caffe2/README.md b/.jenkins/caffe2/README.md new file mode 100644 index 0000000..c22cd8f --- /dev/null +++ b/.jenkins/caffe2/README.md @@ -0,0 +1,14 @@ +# Jenkins + +The scripts in this directory are the entrypoint for testing Caffe2. + +The environment variable `BUILD_ENVIRONMENT` is expected to be set to +the build environment you intend to test. It is a hint for the build +and test scripts to configure Caffe2 a certain way and include/exclude +tests. Docker images, they equal the name of the image itself. For +example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are +built on Jenkins and are used in triggered builds already have this +environment variable set in their manifest. Also see +`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. + +Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 
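+
+As a purely illustrative sketch (not copied from any script in this
+directory), build and test scripts typically branch on `BUILD_ENVIRONMENT`
+with shell pattern matches; the branch bodies below are placeholders:
+
+```bash
+# Hypothetical example -- the real logic lives in build.sh and test.sh.
+if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
+  echo "configure a CUDA build"
+elif [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
+  echo "configure an Android cross-build"
+else
+  echo "configure a default CPU build"
+fi
+```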
diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh new file mode 100755 index 0000000..345e89c --- /dev/null +++ b/.jenkins/caffe2/build.sh @@ -0,0 +1,273 @@ +#!/bin/bash + +set -ex + +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) +CMAKE_ARGS=() + + +# Setup SCCACHE +############################################################################### +# Setup sccache if SCCACHE_BUCKET is set +if [ -n "${SCCACHE_BUCKET}" ]; then + mkdir -p ./sccache + + SCCACHE="$(which sccache)" + if [ -z "${SCCACHE}" ]; then + echo "Unable to find sccache..." + exit 1 + fi + + # Setup wrapper scripts + for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do + ( + echo "#!/bin/sh" + echo "exec $SCCACHE $(which $compiler) \"\$@\"" + ) > "./sccache/$compiler" + chmod +x "./sccache/$compiler" + done + + if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then + ( + echo "#!/bin/sh" + echo "exec $SCCACHE $(which nvcc) \"\$@\"" + ) > "./sccache/nvcc" + chmod +x "./sccache/nvcc" + fi + + export CACHE_WRAPPER_DIR="$PWD/sccache" + + # CMake must find these wrapper scripts + export PATH="$CACHE_WRAPPER_DIR:$PATH" +fi + +# Setup ccache if configured to use it (and not sccache) +if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then + mkdir -p ./ccache + ln -sf "$(which ccache)" ./ccache/cc + ln -sf "$(which ccache)" ./ccache/c++ + ln -sf "$(which ccache)" ./ccache/gcc + ln -sf "$(which ccache)" ./ccache/g++ + ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc + if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then + ln -sf "$(which ccache)" ./ccache/nvcc + fi + export CACHE_WRAPPER_DIR="$PWD/ccache" + export PATH="$CACHE_WRAPPER_DIR:$PATH" +fi + +report_compile_cache_stats() { + if [[ -n "${SCCACHE}" ]]; then + "$SCCACHE" --show-stats + elif which ccache > /dev/null; then + ccache -s + fi +} + +############################################################################### +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. +PYTHON="$(which python)" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON=$(which "python${BASH_REMATCH[1]}") + CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") +fi + + +############################################################################### +# Use special scripts for Android, conda, and setup builds +############################################################################### +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + export ANDROID_NDK=/opt/ndk + CMAKE_ARGS+=("-DBUILD_BINARY=ON") + CMAKE_ARGS+=("-DBUILD_TEST=ON") + CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") + CMAKE_ARGS+=("-DUSE_ZSTD=ON") + "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" + exit 0 +elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then + "${ROOT_DIR}/scripts/build_anaconda.sh" --skip-tests --install-locally "$@" + report_compile_cache_stats + + # This build will be tested against onnx tests, which needs onnx installed. 
+ # At this point the visible protbuf installation will be in conda, since one + # of Caffe2's dependencies uses conda, so the correct protobuf include + # headers are those in conda as well + # This path comes from install_anaconda.sh which installs Anaconda into the + # docker image + PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" + report_compile_cache_stats + exit 0 +elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then + rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX + PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX + exit 0 +fi + + +############################################################################### +# Set cmake args +############################################################################### +CMAKE_ARGS+=("-DBUILD_BINARY=ON") +CMAKE_ARGS+=("-DBUILD_TEST=ON") +CMAKE_ARGS+=("-DINSTALL_TEST=ON") +CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") +CMAKE_ARGS+=("-DUSE_ZSTD=ON") +CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") + +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then + if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then + CMAKE_ARGS+=("-DBUILD_ATEN=ON") + fi +fi +if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then + CMAKE_ARGS+=("-DBLAS=MKL") +fi +if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then + CMAKE_ARGS+=("-DUSE_CUDA=ON") + CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") + CMAKE_ARGS+=("-DUSE_NNPACK=OFF") + + # Explicitly set path to NVCC such that the symlink to ccache or sccache is used + CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") + + # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. + # Setting PATH to resolve to the right nvcc alone isn't enough. + # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. + export CUDA_PATH="/usr/local/cuda" + + # Ensure the ccache symlink can still find the real nvcc binary. + export PATH="/usr/local/cuda/bin:$PATH" +fi +if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + # TODO: This is patching the official FindHip to properly handly + # cmake generator expression. A PR is opened in the upstream repo here: + # https://github.com/ROCm-Developer-Tools/HIP/pull/516 + # remove this hack once it's merged. + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi + + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + export HCC_AMDGPU_TARGET=gfx900 +fi + +# Try to include Redis support for Linux builds +if [ "$(uname)" == "Linux" ]; then + CMAKE_ARGS+=("-DUSE_REDIS=ON") +fi + +# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS +# contbuild at the moment is minimal dependency - it doesn't use glog +# or gflags either. +if [ "$(uname)" == "Darwin" ]; then + CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") +fi + +# Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace +CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") + +if [[ -n "$INTEGRATED" ]]; then + # TODO: This is a temporary hack to work around the issue that both + # caffe2 and pytorch have libcaffe2.so and crossfire at runtime. + CMAKE_ARGS+=("-DBUILD_SHARED_LIBS=OFF") + CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") + CMAKE_ARGS+=("-DCAFFE2_LINK_LOCAL_PROTOBUF=OFF") +fi + +# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) +# and use that if so. 
+if [[ -x "$(command -v cmake3)" ]]; then + CMAKE_BINARY=cmake3 +else + CMAKE_BINARY=cmake +fi +# sccache will fail for CUDA builds if all cores are used for compiling +if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then + MAX_JOBS=`expr $(nproc) - 1` +else + MAX_JOBS=$(nproc) +fi + + +############################################################################### +# Configure and make +############################################################################### +# Run cmake from ./build_caffe2 directory so it doesn't conflict with +# standard PyTorch build directory. Eventually these won't need to +# be separate. +rm -rf build_caffe2 +mkdir build_caffe2 +cd ./build_caffe2 + +# Configure +${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" + +# Build +if [ "$(uname)" == "Linux" ]; then + make "-j${MAX_JOBS}" install +else + echo "Don't know how to build on $(uname)" + exit 1 +fi + +report_compile_cache_stats + + +############################################################################### +# Install ONNX +############################################################################### + +# Install ONNX into a local directory +pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" + +report_compile_cache_stats + +if [[ -n "$INTEGRATED" ]]; then + # sccache will be stuck if all cores are used for compiling + # see https://github.com/pytorch/pytorch/pull/7361 + if [[ -n "${SCCACHE}" ]]; then + export MAX_JOBS=`expr $(nproc) - 1` + fi + pip install --user -v -b /tmp/pip_install_torch "file://${ROOT_DIR}#egg=torch" +fi + +report_compile_cache_stats + +# Symlink the caffe2 base python path into the system python path, +# so that we can import caffe2 without having to change $PYTHONPATH. +# Run in a subshell to contain environment set by /etc/os-release. +# +# This is only done when running on Jenkins! We don't want to pollute +# the user environment with Python symlinks and ld.so.conf.d hacks. +# +if [ -n "${JENKINS_URL}" ]; then + ( + source /etc/os-release + + function python_version() { + "$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])' + } + + # Debian/Ubuntu + if [[ "$ID_LIKE" == *debian* ]]; then + python_path="/usr/local/lib/$(python_version)/dist-packages" + sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" + fi + + # RHEL/CentOS + if [[ "$ID_LIKE" == *rhel* ]]; then + python_path="/usr/lib64/$(python_version)/site-packages/" + sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" + fi + + # /etc/ld.so.conf.d is used on both Debian and RHEL + echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf + sudo ldconfig + ) +fi diff --git a/.jenkins/caffe2/dirty.sh b/.jenkins/caffe2/dirty.sh new file mode 100755 index 0000000..6b9ba54 --- /dev/null +++ b/.jenkins/caffe2/dirty.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -ex +upstream="$1" +pr="$2" +git diff --name-only "$upstream" "$pr" +# For safety, unconditionally trigger for any changes. +#git diff --name-only "$upstream" "$pr" | grep -Eq '^(CMakeLists.txt|Makefile|.gitmodules|.jenkins/caffe2|binaries|caffe|caffe2|cmake|conda|docker|docs/caffe2|modules|scripts|third_party)' diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh new file mode 100755 index 0000000..a4bb748 --- /dev/null +++ b/.jenkins/caffe2/test.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../.. 
&& pwd) +TEST_DIR=$ROOT_DIR/caffe2_tests + +# Figure out which Python to use +PYTHON="python" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON="python${BASH_REMATCH[1]}" +fi + +# The prefix must mirror the setting from build.sh +INSTALL_PREFIX="/usr/local/caffe2" + +# Anaconda builds have a special install prefix and python +if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then + # This path comes from install_anaconda.sh which installs Anaconda into the + # docker image + PYTHON="/opt/conda/bin/python" + INSTALL_PREFIX="/opt/conda/" +fi + +# Add the site-packages in the caffe2 install prefix to the PYTHONPATH +SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))") +INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}" + +# Skip tests in environments where they are not built/applicable +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + echo 'Skipping tests' + exit 0 +fi + +# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed +# Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this. +if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then + export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR" + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib" +fi + +cd "$ROOT_DIR" + +if [ -d $TEST_DIR ]; then + echo "Directory $TEST_DIR already exists; please remove it..." + exit 1 +fi + +mkdir -p $TEST_DIR/{cpp,python} + +cd ${INSTALL_PREFIX} + +# C++ tests +echo "Running C++ tests.." +gtest_reports_dir="${TEST_DIR}/cpp" +junit_reports_dir="${TEST_DIR}/junit_reports" +mkdir -p "$gtest_reports_dir" "$junit_reports_dir" +for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do + case "$test" in + # skip tests we know are hanging or bad + */mkl_utils_test|*/aten/integer_divider_test) + continue + ;; + */aten/*) + # ATen uses test framework Catch2 + "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + ;; + *) + "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" + ;; + esac +done + +# Get the relative path to where the caffe2 python module was installed +CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2" + +# Collect additional tests to run (outside caffe2/python) +EXTRA_TESTS=() + +# CUDA builds always include NCCL support +if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then + EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl") +fi + +conda_ignore_test=() +if [[ $BUILD_ENVIRONMENT == conda* ]]; then + # These tests both assume Caffe2 was built with leveldb, which is not the case + conda_ignore_test+=("--ignore $CAFFE2_PYPATH/python/dataio_test.py") + conda_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/checkpoint_test.py") +fi + + +# TODO: re-enable this for rocm CI jobs once we have more rocm workers +if [[ $BUILD_ENVIRONMENT != *rocm* ]]; then + # Python tests + echo "Running Python tests.." 
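+  # pytest flags: -x stops at the first failing test, -v is verbose, and
+  # --junit-xml writes a report that Jenkins can collect.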
+ "$PYTHON" \ + -m pytest \ + -x \ + -v \ + --junit-xml="$TEST_DIR/python/result.xml" \ + --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \ + --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \ + ${conda_ignore_test[@]} \ + "$CAFFE2_PYPATH/python" \ + "${EXTRA_TESTS[@]}" +fi + +if [[ -n "$INTEGRATED" ]]; then + pip install --user pytest-xdist torchvision + "$ROOT_DIR/scripts/onnx/test.sh" -p +fi diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md new file mode 100644 index 0000000..ea6c6dd --- /dev/null +++ b/.jenkins/pytorch/README.md @@ -0,0 +1,42 @@ +This directory contains scripts for our continuous integration. + +One important thing to keep in mind when reading the scripts here is +that they are all based off of Docker images, which we build for each of +the various system configurations we want to run on Jenkins. This means +it is very easy to run these tests yourself: + +1. Figure out what Docker image you want. The general template for our + images look like: + ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``, + where ``$BUILD_ENVIRONMENT`` is one of the build environments + enumerated in + [pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh) + +2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and + run one of the scripts in this directory. + +The Docker images are designed so that any "reasonable" build commands +will work; if you look in [build.sh](build.sh) you will see that it is a +very simple script. This is intentional. Idiomatic build instructions +should work inside all of our Docker images. You can tweak the commands +however you need (e.g., in case you want to rebuild with DEBUG, or rerun +the build with higher verbosity, etc.). + +We have to do some work to make this so. Here is a summary of the +mechanisms we use: + +- We install binaries to directories like `/usr/local/bin` which + are automatically part of your PATH. + +- We add entries to the PATH using Docker ENV variables (so + they apply when you enter Docker) and `/etc/environment` (so they + continue to apply even if you sudo), instead of modifying + `PATH` in our build scripts. + +- We use `/etc/ld.so.conf.d` to register directories containing + shared libraries, instead of modifying `LD_LIBRARY_PATH` in our + build scripts. + +- We reroute well known paths like `/usr/bin/gcc` to alternate + implementations with `update-alternatives, instead of setting + `CC` and `CXX` in our implementations. diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh new file mode 100755 index 0000000..4ece2ae --- /dev/null +++ b/.jenkins/pytorch/build-asan.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. 
+ +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Clang version:" +clang --version + +# detect_leaks=0: Python is very leaky, so we need suppress it +# symbolize=1: Gives us much better errors when things go wrong +export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + +# TODO: Make the ASAN flags a more unified env var +CC="clang" CXX="clang++" LDSHARED="clang --shared" \ + CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \ + NO_CUDA=1 DEBUG=1 \ + python setup.py install diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh new file mode 100755 index 0000000..bb06df2 --- /dev/null +++ b/.jenkins/pytorch/build.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then + exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* +fi + +# TODO: move this to Docker +# TODO: add both NCCL and MPI in CI test by fixing these test first +# sudo apt-get update +# sudo apt-get install libnccl-dev libnccl2 +# sudo apt-get install openmpi-bin libopenmpi-dev + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Python version:" +python --version + +echo "GCC version:" +gcc --version + +echo "CMake version:" +cmake --version + +# TODO: Don't run this... +pip install -r requirements.txt || true + +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + export HCC_AMDGPU_TARGET=gfx900 + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + sudo chown -R jenkins:jenkins /usr/local + rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true + python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" + USE_ROCM=1 python setup.py install + exit +fi + +# TODO: Don't install this here +if ! which conda; then + pip install mkl mkl-devel +fi + +# sccache will fail for CUDA builds if all cores are used for compiling +# gcc 7 with sccache seems to have intermittent OOM issue if all cores are used +if ([[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]) && which sccache > /dev/null; then + export MAX_JOBS=`expr $(nproc) - 1` +fi + +# Target only our CI GPU machine's CUDA arch to speed up the build +export TORCH_CUDA_ARCH_LIST=5.2 + +if [[ "$BUILD_ENVIRONMENT" == *trusty-py3.6-gcc5.4* ]]; then + export DEBUG=1 +fi + +WERROR=1 python setup.py install + +# Add the test binaries so that they won't be git clean'ed away +git add -f build/bin + +# Testing ATen install +if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then + echo "Testing ATen install" + time tools/test_aten_install.sh +fi + +# Test C FFI plugins +# cffi install doesn't work for Python 3.7 +if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then + # TODO: Don't run this here + pip install cffi + git clone https://github.com/pytorch/extension-ffi.git + pushd extension-ffi/script + python build.py + popd +fi + +# Test documentation build +if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then + pushd docs + # TODO: Don't run this here + pip install -r requirements.txt || true + make html + popd +fi + +# Test no-Python build +if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Building libtorch" + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. 
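+  # WERROR=1 and VERBOSE=1 are set in the environment of build_all.sh; as with
+  # the setup.py build above, the intent appears to be to treat compiler
+  # warnings as errors and to emit verbose build output.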
+ WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$PWD/../cpp-build" +fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh new file mode 100644 index 0000000..ca728df --- /dev/null +++ b/.jenkins/pytorch/common.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Common setup for all Jenkins scripts + +# NB: define this function before set -x, so that we don't +# pollute the log with a premature EXITED_USER_LAND ;) +function cleanup { + # Note that if you've exited user land, then CI will conclude that + # any failure is the CI's fault. So we MUST only output this + # string + retcode=$? + set +x + if [ $retcode -eq 0 ]; then + echo "EXITED_USER_LAND" + fi +} + +set -ex + +# Required environment variables: +# $BUILD_ENVIRONMENT (should be set by your Docker image) + +# This token is used by a parser on Jenkins logs for determining +# if a failure is a legitimate problem, or a problem with the build +# system; to find out more, grep for this string in ossci-job-dsl. +echo "ENTERED_USER_LAND" + +# compositional trap taken from https://stackoverflow.com/a/7287873/23845 + +# note: printf is used instead of echo to avoid backslash +# processing and to properly handle values that begin with a '-'. + +log() { printf '%s\n' "$*"; } +error() { log "ERROR: $*" >&2; } +fatal() { error "$@"; exit 1; } + +# appends a command to a trap +# +# - 1st arg: code to add +# - remaining args: names of traps to modify +# +trap_add() { + trap_add_cmd=$1; shift || fatal "${FUNCNAME} usage error" + for trap_add_name in "$@"; do + trap -- "$( + # helper fn to get existing trap command from output + # of trap -p + extract_trap_cmd() { printf '%s\n' "$3"; } + # print existing trap command with newline + eval "extract_trap_cmd $(trap -p "${trap_add_name}")" + # print the new trap command + printf '%s\n' "${trap_add_cmd}" + )" "${trap_add_name}" \ + || fatal "unable to add to trap ${trap_add_name}" + done +} +# set the trace attribute for the above function. this is +# required to modify DEBUG or RETURN traps because functions don't +# inherit them unless the trace attribute is set +declare -f -t trap_add + +trap_add cleanup EXIT + +if which sccache > /dev/null; then + # Save sccache logs to file + sccache --stop-server || true + rm ~/sccache_error.log || true + SCCACHE_ERROR_LOG=~/sccache_error.log RUST_LOG=sccache::server=error sccache --start-server + + # Report sccache stats for easier debugging + sccache --zero-stats + function sccache_epilogue() { + echo '=================== sccache compilation log ===================' + python $(dirname "${BASH_SOURCE[0]}")/print_sccache_log.py ~/sccache_error.log + echo '=========== If your build fails, please take a look at the log above for possible reasons ===========' + sccache --show-stats + sccache --stop-server || true + } + trap_add sccache_epilogue EXIT +fi + +if which ccache > /dev/null; then + # Report ccache stats for easier debugging + ccache --zero-stats + ccache --show-stats + function ccache_epilogue() { + ccache --show-stats + } + trap_add ccache_epilogue EXIT +fi + +# It's called a COMPACT_JOB_NAME because it's distinct from the +# Jenkin's provided JOB_NAME, which also includes a prefix folder +# e.g. 
pytorch-builds/ + +if [ -z "$COMPACT_JOB_NAME" ]; then + echo "Jenkins build scripts must set COMPACT_JOB_NAME" + exit 1 +fi + +if grep --line-regexp -q "$COMPACT_JOB_NAME" "$(dirname "${BASH_SOURCE[0]}")/disabled-configs.txt"; then + echo "Job is explicitly disabled, SKIPPING" + exit 0 +else + echo "Job is not disabled, proceeding" +fi + +if grep --line-regexp -q "$COMPACT_JOB_NAME" "$(dirname "${BASH_SOURCE[0]}")/enabled-configs.txt"; then + echo "Job is enabled, proceeding" +else + echo "Job is not enabled, FAILING now (revert changes to enabled-configs.txt to fix this)" + exit 1 +fi + +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then + BUILD_TEST_LIBTORCH=1 +else + BUILD_TEST_LIBTORCH=0 +fi + +# Use conda cmake in some CI build. Conda cmake will be newer than our supported +# min version 3.5, so we only do it in two builds that we know should use conda. +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then + if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \ + [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then + if ! which conda; then + echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty" + exit 1 + else + conda install -q -y cmake + fi + else + if ! cmake --version | grep 'cmake version 3\.5'; then + echo "Expected ${BUILD_ENVIRONMENT} to have cmake version 3.5.* (min support version), but 'cmake --version' returns:" + cmake --version + exit 1 + fi + fi +fi diff --git a/.jenkins/pytorch/dirty.sh b/.jenkins/pytorch/dirty.sh new file mode 100755 index 0000000..cc0d90e --- /dev/null +++ b/.jenkins/pytorch/dirty.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -ex +upstream="$1" +pr="$2" +git diff --name-only "$upstream" "$pr" +# Now that PyTorch build depends on Caffe2, unconditionally trigger +# for any changes. +# TODO: Replace this with a NEGATIVE regex that allows us to blacklist +# files (letting us skip builds when they are unnecessary) +#git diff --name-only "$upstream" "$pr" | grep -Eq '^(aten/|caffe2/|.jenkins/pytorch|docs/(make.bat|Makefile|requirements.txt|source)|mypy|requirements.txt|setup.py|test/|third_party/|tools/|\.gitmodules|torch/)' diff --git a/.jenkins/pytorch/disabled-configs.txt b/.jenkins/pytorch/disabled-configs.txt new file mode 100644 index 0000000..cdd51d3 --- /dev/null +++ b/.jenkins/pytorch/disabled-configs.txt @@ -0,0 +1,5 @@ +# This file contains a list of disabled configurations. Disabled +# configurations are skipped and not considered a failure if they +# fail. You can use this to temporarily reserve a test name to +# turn on CI side before PyTorch repository supports it. This +# file has the same format as .jenkins/enabled-configs.txt diff --git a/.jenkins/pytorch/docker-build-test.sh b/.jenkins/pytorch/docker-build-test.sh new file mode 100755 index 0000000..508699a --- /dev/null +++ b/.jenkins/pytorch/docker-build-test.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +COMPACT_JOB_NAME="docker-build-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +docker build -t pytorch . diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt new file mode 100644 index 0000000..14e3144 --- /dev/null +++ b/.jenkins/pytorch/enabled-configs.txt @@ -0,0 +1,43 @@ +# This file contains a list of enabled configurations +# to perform tests on. If you want to run tests on CI on +# a limited set of tests before enabling the full test suite, +# you can delete lines from this file. 
Any test that is not +# in this file will report a failure (so you don't forget to +# reenable the tests on merge ;) + +pytorch-linux-xenial-cuda8-cudnn6-py3-build +pytorch-linux-xenial-cuda8-cudnn6-py3-test +pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test +pytorch-linux-xenial-cuda9-cudnn7-py2-build +pytorch-linux-xenial-cuda9-cudnn7-py2-test +pytorch-linux-xenial-cuda9-cudnn7-py3-build +pytorch-linux-xenial-cuda9-cudnn7-py3-test +pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build +pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test +pytorch-linux-xenial-py3-clang5-asan-build +pytorch-linux-xenial-py3-clang5-asan-test +pytorch-linux-trusty-py2.7.9-build +pytorch-linux-trusty-py2.7.9-test +pytorch-linux-trusty-py2.7-build +pytorch-linux-trusty-py2.7-test +pytorch-linux-trusty-py3.5-build +pytorch-linux-trusty-py3.5-test +pytorch-linux-trusty-py3.6-gcc4.8-build +pytorch-linux-trusty-py3.6-gcc4.8-test +pytorch-linux-trusty-py3.6-gcc5.4-build +pytorch-linux-trusty-py3.6-gcc5.4-test +pytorch-linux-trusty-py3.6-gcc7.2-build +pytorch-linux-trusty-py3.6-gcc7.2-test +pytorch-linux-trusty-py3.6-gcc7-build +pytorch-linux-trusty-py3.6-gcc7-test +pytorch-linux-trusty-pynightly-build +pytorch-linux-trusty-pynightly-test +pytorch-win-ws2016-cuda9-cudnn7-py3-build +pytorch-win-ws2016-cuda9-cudnn7-py3-test +pytorch-macos-10.13-py3-build +pytorch-macos-10.13-py3-test +pytorch-macos-10.13-cuda9.2-cudnn7-py3-build +pytorch-docker-build-test +short-perf-test-cpu +short-perf-test-gpu +py2-clang3.8-rocmnightly-ubuntu16.04-build diff --git a/.jenkins/pytorch/macos-build-test.sh b/.jenkins/pytorch/macos-build-test.sh new file mode 100755 index 0000000..330e093 --- /dev/null +++ b/.jenkins/pytorch/macos-build-test.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-build* ]]; then + source "$(dirname "${BASH_SOURCE[0]}")/macos-build.sh" +fi + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test* ]]; then + source "$(dirname "${BASH_SOURCE[0]}")/macos-test.sh" +fi diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh new file mode 100755 index 0000000..41b272e --- /dev/null +++ b/.jenkins/pytorch/macos-build.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +export PATH="/usr/local/bin:$PATH" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Set up conda environment +export PYTORCH_ENV_DIR="${HOME}/pytorch-ci-env" +# If a local installation of conda doesn't exist, we download and install conda +if [ ! 
-d "${PYTORCH_ENV_DIR}/miniconda3" ]; then + mkdir -p ${PYTORCH_ENV_DIR} + curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${PYTORCH_ENV_DIR}/miniconda3.sh + bash ${PYTORCH_ENV_DIR}/miniconda3.sh -b -p ${PYTORCH_ENV_DIR}/miniconda3 +fi +export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH" +source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate +conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* + +git submodule update --init --recursive +export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ + +# Build PyTorch +if [[ "${JOB_BASE_NAME}" == *cuda9.2* ]]; then + export CUDA_VERSION=9.2 + export TORCH_CUDA_ARCH_LIST=5.2 + export PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/bin${PATH:+:${PATH}} + export DYLD_LIBRARY_PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}} + export CUDA_HOME=/Developer/NVIDIA/CUDA-${CUDA_VERSION} + export NO_CUDA=0 + + # Eigen gives "explicit specialization of class must precede its first use" error + # when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead. + export DEVELOPER_DIR=/Library/Developer/CommandLineTools +else + export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer +fi + +export MACOSX_DEPLOYMENT_TARGET=10.9 +export CXX=clang++ +export CC=clang +if which sccache > /dev/null; then + printf "#!/bin/sh\nexec sccache $(which clang++) \$*" > "${PYTORCH_ENV_DIR}/clang++" + chmod a+x "${PYTORCH_ENV_DIR}/clang++" + + printf "#!/bin/sh\nexec sccache $(which clang) \$*" > "${PYTORCH_ENV_DIR}/clang" + chmod a+x "${PYTORCH_ENV_DIR}/clang" + + if [[ "${JOB_BASE_NAME}" == *cuda* ]]; then + printf "#!/bin/sh\nexec sccache $(which nvcc) \$*" > "${PYTORCH_ENV_DIR}/nvcc" + chmod a+x "${PYTORCH_ENV_DIR}/nvcc" + export CUDA_NVCC_EXECUTABLE="${PYTORCH_ENV_DIR}/nvcc" + fi + + export PATH="${PYTORCH_ENV_DIR}:$PATH" +fi +# If we run too many parallel jobs, we will OOM +export MAX_JOBS=2 + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} + +python setup.py install + +# Upload torch binaries when the build job is finished +7z a ${IMAGE_COMMIT_TAG}.7z ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* +aws s3 cp ${IMAGE_COMMIT_TAG}.7z s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z --acl public-read diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh new file mode 100755 index 0000000..7dc760c --- /dev/null +++ b/.jenkins/pytorch/macos-test.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export PATH="/usr/local/bin:$PATH" + +# Set up conda environment +export PYTORCH_ENV_DIR="${HOME}/pytorch-ci-env" +# If a local installation of conda doesn't exist, we download and install conda +if [ ! 
-d "${PYTORCH_ENV_DIR}/miniconda3" ]; then + mkdir -p ${PYTORCH_ENV_DIR} + curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${PYTORCH_ENV_DIR}/miniconda3.sh + bash ${PYTORCH_ENV_DIR}/miniconda3.sh -b -p ${PYTORCH_ENV_DIR}/miniconda3 +fi +export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH" +source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate +conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* + +git submodule update --init --recursive +export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ + +# Test PyTorch +if [[ "${JOB_BASE_NAME}" == *cuda9.2* ]]; then + # Eigen gives "explicit specialization of class must precede its first use" error + # when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead. + export DEVELOPER_DIR=/Library/Developer/CommandLineTools +else + export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer +fi +export MACOSX_DEPLOYMENT_TARGET=10.9 +export CXX=clang++ +export CC=clang +# If we run too many parallel jobs, we will OOM +export MAX_JOBS=2 + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} + +# Download torch binaries in the test jobs +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* +aws s3 cp s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z ${IMAGE_COMMIT_TAG}.7z +7z x ${IMAGE_COMMIT_TAG}.7z -o"${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages" + +test_python_all() { + echo "Ninja version: $(ninja --version)" + python test/run_test.py --verbose +} + +test_cpp_api() { + # C++ API + + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. + # But still clean it before we perform our own build. + # + CPP_BUILD="$PWD/../cpp-build" + rm -rf $CPP_BUILD + mkdir -p $CPP_BUILD + WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$CPP_BUILD" + + python tools/download_mnist.py --quiet -d test/cpp/api/mnist + + # Unfortunately it seems like the test can't load from miniconda3 + # without these paths being set + export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib" + "$CPP_BUILD"/libtorch/bin/test_api +} + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then + test_python_all + test_cpp_api +else + if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + test_python_all + elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then + test_cpp_api + fi +fi diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh new file mode 100755 index 0000000..ceee027 --- /dev/null +++ b/.jenkins/pytorch/multigpu-test.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-multigpu-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Testing pytorch (distributed only)" +time python test/run_test.py --verbose -i distributed diff --git a/.jenkins/pytorch/perf_test/common.sh b/.jenkins/pytorch/perf_test/common.sh new file mode 100644 index 0000000..21ce05f --- /dev/null +++ b/.jenkins/pytorch/perf_test/common.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +run_test () { + rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/ + "$@" + cd .. 
&& rm -rf test_tmp/ +} + +get_runtime_of_command () { + TIMEFORMAT=%R + + # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null) + runtime=$( { time $@; } 2>&1 1>/dev/null) + if [[ $runtime == *"Error"* ]]; then + exit 1 + fi + runtime=${runtime#+++ $@} + runtime=$(python -c "print($runtime)") + + echo $runtime +} diff --git a/.jenkins/pytorch/perf_test/compare_with_baseline.py b/.jenkins/pytorch/perf_test/compare_with_baseline.py new file mode 100644 index 0000000..0fbeda6 --- /dev/null +++ b/.jenkins/pytorch/perf_test/compare_with_baseline.py @@ -0,0 +1,66 @@ +import sys +import json +import numpy +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--test-name', dest='test_name', action='store', + required=True, help='test name') +parser.add_argument('--sample-stats', dest='sample_stats', action='store', + required=True, help='stats from sample') +parser.add_argument('--update', action='store_true', + help='whether to update baseline using stats from sample') +args = parser.parse_args() + +test_name = args.test_name + +if 'cpu' in test_name: + backend = 'cpu' +elif 'gpu' in test_name: + backend = 'gpu' + +data_file_path = '../{}_runtime.json'.format(backend) + +with open(data_file_path) as data_file: + data = json.load(data_file) + +if test_name in data: + mean = float(data[test_name]['mean']) + sigma = float(data[test_name]['sigma']) +else: + # Let the test pass if baseline number doesn't exist + mean = sys.maxsize + sigma = 0.001 + +print("population mean: ", mean) +print("population sigma: ", sigma) + +sample_stats_data = json.loads(args.sample_stats) + +sample_mean = sample_stats_data['mean'] +sample_sigma = sample_stats_data['sigma'] + +print("sample mean: ", sample_mean) +print("sample sigma: ", sample_sigma) + +z_value = (sample_mean - mean) / sigma + +print("z-value: ", z_value) + +if z_value >= 3: + raise Exception('''\n +z-value >= 3, there is high chance of perf regression.\n +To reproduce this regression, run `cd .jenkins/pytorch/perf_test/ && bash ''' + test_name + '''.sh` on your local machine and compare the runtime before/after your code change. +''') +else: + print("z-value < 3, no perf regression detected.") + if args.update: + print("We will use these numbers as new baseline.") + new_data_file_path = '../new_{}_runtime.json'.format(backend) + with open(new_data_file_path) as new_data_file: + new_data = json.load(new_data_file) + new_data[test_name] = {} + new_data[test_name]['mean'] = sample_mean + new_data[test_name]['sigma'] = max(sample_sigma, sample_mean * 0.1) + with open(new_data_file_path, 'w') as new_data_file: + json.dump(new_data, new_data_file, indent=4) diff --git a/.jenkins/pytorch/perf_test/get_stats.py b/.jenkins/pytorch/perf_test/get_stats.py new file mode 100644 index 0000000..9e6e72a --- /dev/null +++ b/.jenkins/pytorch/perf_test/get_stats.py @@ -0,0 +1,16 @@ +import sys +import json +import numpy + +sample_data_list = sys.argv[1:] +sample_data_list = [float(v.strip()) for v in sample_data_list] + +sample_mean = numpy.mean(sample_data_list) +sample_sigma = numpy.std(sample_data_list) + +data = { + 'mean': sample_mean, + 'sigma': sample_sigma, +} + +print(json.dumps(data)) diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh new file mode 100644 index 0000000..e1360c7 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +. 
./common.sh + +test_cpu_speed_mini_sequence_labeler () { + echo "Testing: mini sequence labeler, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 726567a455edbfda6199445922a8cfee82535664 + + cd scripts/mini_sequence_labeler + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py) + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_mini_sequence_labeler "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh new file mode 100644 index 0000000..af3d32a --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +. ./common.sh + +test_cpu_speed_mnist () { + echo "Testing: MNIST, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/mnist + + pip install -r requirements.txt + + # Download data + python main.py --epochs 0 + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_mnist "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh new file mode 100644 index 0000000..cd4776c --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh @@ -0,0 +1,28 @@ +. ./common.sh + +test_cpu_speed_torch () { + echo "Testing: torch.*, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/yf225/perf-tests.git + + if [ "$1" == "compare_with_baseline" ]; then + export ARGS="--compare ../cpu_runtime.json" + elif [ "$1" == "compare_and_update" ]; then + export ARGS="--compare ../cpu_runtime.json --update ../new_cpu_runtime.json" + elif [ "$1" == "update_only" ]; then + export ARGS="--update ../new_cpu_runtime.json" + fi + + if ! python perf-tests/modules/test_cpu_torch.py ${ARGS}; then + echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash "${FUNCNAME[0]}".sh\` on your local machine and compare the runtime before/after your code change." 
+ exit 1 + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_torch "$@" +fi + diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh new file mode 100644 index 0000000..c924e2e --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh @@ -0,0 +1,28 @@ +. ./common.sh + +test_cpu_speed_torch_tensor () { + echo "Testing: torch.Tensor.*, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/yf225/perf-tests.git + + if [ "$1" == "compare_with_baseline" ]; then + export ARGS="--compare ../cpu_runtime.json" + elif [ "$1" == "compare_and_update" ]; then + export ARGS="--compare ../cpu_runtime.json --update ../new_cpu_runtime.json" + elif [ "$1" == "update_only" ]; then + export ARGS="--update ../new_cpu_runtime.json" + fi + + if ! python perf-tests/modules/test_cpu_torch_tensor.py ${ARGS}; then + echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash "${FUNCNAME[0]}".sh\` on your local machine and compare the runtime before/after your code change." + exit 1 + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_torch_tensor "$@" +fi + diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh new file mode 100644 index 0000000..ab02eb8 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_cudnn_lstm () { + echo "Testing: CuDNN LSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_cudnn_lstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh new file mode 100644 index 0000000..ddc0d6f --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_lstm () { + echo "Testing: LSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. 
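+  # Back in the test_tmp/ scratch directory created by run_test, so the ../
+  # paths below resolve to the helper scripts in perf_test/.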
+ + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_lstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh new file mode 100644 index 0000000..fd76267 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_mlstm () { + echo "Testing: MLSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_mlstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh new file mode 100644 index 0000000..61d7585 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_mnist () { + echo "Testing: MNIST, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/mnist + + pip install -r requirements.txt + + # Download data + python main.py --epochs 0 + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_mnist "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh new file mode 100644 index 0000000..89ed044 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +. 
./common.sh + +test_gpu_speed_word_language_model () { + echo "Testing: word language model on Wikitext-2, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/word_language_model + + cd data/wikitext-2 + + # Reduce dataset size, so that we can have more runs per test + sed -n '1,200p' test.txt > test_tmp.txt + sed -n '1,1000p' train.txt > train_tmp.txt + sed -n '1,200p' valid.txt > valid_tmp.txt + + mv test_tmp.txt test.txt + mv train_tmp.txt train.txt + mv valid_tmp.txt valid.txt + + cd ../.. + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --cuda --epochs 1) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_word_language_model "$@" +fi diff --git a/.jenkins/pytorch/perf_test/update_commit_hash.py b/.jenkins/pytorch/perf_test/update_commit_hash.py new file mode 100644 index 0000000..ee7fa8a --- /dev/null +++ b/.jenkins/pytorch/perf_test/update_commit_hash.py @@ -0,0 +1,13 @@ +import sys +import json + +data_file_path = sys.argv[1] +commit_hash = sys.argv[2] + +with open(data_file_path) as data_file: + data = json.load(data_file) + +data['commit'] = commit_hash + +with open(data_file_path, 'w') as data_file: + json.dump(data, data_file) diff --git a/.jenkins/pytorch/print_sccache_log.py b/.jenkins/pytorch/print_sccache_log.py new file mode 100644 index 0000000..c914728 --- /dev/null +++ b/.jenkins/pytorch/print_sccache_log.py @@ -0,0 +1,11 @@ +import sys + +log_file_path = sys.argv[1] + +with open(log_file_path) as f: + lines = f.readlines() + +for line in lines: + # Ignore errors from CPU instruction set testing + if 'src.c' not in line: + print(line) diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh b/.jenkins/pytorch/short-perf-test-cpu.sh new file mode 100755 index 0000000..5aa86cb --- /dev/null +++ b/.jenkins/pytorch/short-perf-test-cpu.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +COMPACT_JOB_NAME="short-perf-test-cpu" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +cd .jenkins/pytorch/perf_test + +echo "Running CPU perf test for PyTorch..." 
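+# awscli is used below to download the baseline runtime JSON from S3 and, when building master, to upload the updated baseline.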
+ +pip install awscli + +# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read +# More info at https://github.com/aws/aws-cli/issues/2321 +aws configure set default.s3.multipart_threshold 5GB + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Get current master commit hash + export MASTER_COMMIT_ID=$(git log --format="%H" -n 1) +fi + +# Find the master commit to test against +git remote add upstream https://github.com/pytorch/pytorch.git +git fetch upstream +IFS=$'\n' +master_commit_ids=($(git rev-list upstream/master)) +for commit_id in "${master_commit_ids[@]}"; do + if aws s3 ls s3://ossci-perf-test/pytorch/cpu_runtime/${commit_id}.json; then + LATEST_TESTED_COMMIT=${commit_id} + break + fi +done +aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/${LATEST_TESTED_COMMIT}.json cpu_runtime.json + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Prepare new baseline file + cp cpu_runtime.json new_cpu_runtime.json + python update_commit_hash.py new_cpu_runtime.json ${MASTER_COMMIT_ID} +fi + +# Include tests +. ./test_cpu_speed_mini_sequence_labeler.sh +. ./test_cpu_speed_mnist.sh +. ./test_cpu_speed_torch.sh +. ./test_cpu_speed_torch_tensor.sh + +# Run tests +export TEST_MODE="compare_with_baseline" +if [[ "$COMMIT_SOURCE" == master ]]; then + export TEST_MODE="compare_and_update" +fi + +# Operator tests +run_test test_cpu_speed_torch ${TEST_MODE} +run_test test_cpu_speed_torch_tensor ${TEST_MODE} + +# Sample model tests +run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE} +run_test test_cpu_speed_mnist 20 ${TEST_MODE} + +if [[ "$COMMIT_SOURCE" == master ]]; then + # This could cause race condition if we are testing the same master commit twice, + # but the chance of them executing this line at the same time is low. + aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/${MASTER_COMMIT_ID}.json --acl public-read +fi diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh b/.jenkins/pytorch/short-perf-test-gpu.sh new file mode 100755 index 0000000..dc59fde --- /dev/null +++ b/.jenkins/pytorch/short-perf-test-gpu.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +COMPACT_JOB_NAME="short-perf-test-gpu" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +pushd .jenkins/pytorch/perf_test + +echo "Running GPU perf test for PyTorch..." + +pip install awscli + +# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read +# More info at https://github.com/aws/aws-cli/issues/2321 +aws configure set default.s3.multipart_threshold 5GB + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Get current master commit hash + export MASTER_COMMIT_ID=$(git log --format="%H" -n 1) +fi + +# Find the master commit to test against +git remote add upstream https://github.com/pytorch/pytorch.git +git fetch upstream +IFS=$'\n' +master_commit_ids=($(git rev-list upstream/master)) +for commit_id in "${master_commit_ids[@]}"; do + if aws s3 ls s3://ossci-perf-test/pytorch/gpu_runtime/${commit_id}.json; then + LATEST_TESTED_COMMIT=${commit_id} + break + fi +done +aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/${LATEST_TESTED_COMMIT}.json gpu_runtime.json + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Prepare new baseline file + cp gpu_runtime.json new_gpu_runtime.json + python update_commit_hash.py new_gpu_runtime.json ${MASTER_COMMIT_ID} +fi + +# Include tests +. ./test_gpu_speed_mnist.sh +. ./test_gpu_speed_word_language_model.sh +. ./test_gpu_speed_cudnn_lstm.sh +. ./test_gpu_speed_lstm.sh +. 
./test_gpu_speed_mlstm.sh + +# Run tests +if [[ "$COMMIT_SOURCE" == master ]]; then + run_test test_gpu_speed_mnist 20 compare_and_update + run_test test_gpu_speed_word_language_model 20 compare_and_update + run_test test_gpu_speed_cudnn_lstm 20 compare_and_update + run_test test_gpu_speed_lstm 20 compare_and_update + run_test test_gpu_speed_mlstm 20 compare_and_update +else + run_test test_gpu_speed_mnist 20 compare_with_baseline + run_test test_gpu_speed_word_language_model 20 compare_with_baseline + run_test test_gpu_speed_cudnn_lstm 20 compare_with_baseline + run_test test_gpu_speed_lstm 20 compare_with_baseline + run_test test_gpu_speed_mlstm 20 compare_with_baseline +fi + +if [[ "$COMMIT_SOURCE" == master ]]; then + # This could cause race condition if we are testing the same master commit twice, + # but the chance of them executing this line at the same time is low. + aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/${MASTER_COMMIT_ID}.json --acl public-read +fi + +popd diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh new file mode 100755 index 0000000..bc27628 --- /dev/null +++ b/.jenkins/pytorch/test.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +echo "Testing pytorch" + +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + echo "Skipping ROCm tests for now" + exit 0 +fi + +# JIT C++ extensions require ninja. +git clone https://github.com/ninja-build/ninja --quiet +pushd ninja +python ./configure.py --bootstrap +export PATH="$PWD:$PATH" +popd + +# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems +# if you're not careful. Check this if you made some changes and the +# ASAN test is not working +if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then + export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + export UBSAN_OPTIONS=print_stacktrace=1 + export PYTORCH_TEST_WITH_ASAN=1 + export PYTORCH_TEST_WITH_UBSAN=1 + # TODO: Figure out how to avoid hard-coding these paths + export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer + export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so + # Increase stack size, because ASAN red zones use more stack + ulimit -s 81920 + + function get_exit_code() { + set +e + "$@" + retcode=$? + set -e + return $retcode + } + (cd test && python -c "import torch") + echo "The next three invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured" + (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_asan(3)") + (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_ubsan(0)") + (cd test && ! 
get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") +fi + +export ATEN_DISABLE_AVX= +export ATEN_DISABLE_AVX2= +if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then + export ATEN_DISABLE_AVX=1 +fi +if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_DISABLE_AVX2=1 +fi + +test_python_nn() { + time python test/run_test.py --include nn --verbose +} + +test_python_all_except_nn() { + time python test/run_test.py --exclude nn --verbose +} + +test_aten() { + # Test ATen + if [[ "$BUILD_ENVIRONMENT" != *asan* ]]; then + echo "Running ATen tests with pytorch lib" + TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib + # NB: the ATen test binaries don't have RPATH set, so it's necessary to + # put the dynamic libraries somewhere were the dynamic linker can find them. + # This is a bit of a hack. + ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin + ln -s "$TORCH_LIB_PATH"/libnccl* build/bin + ls build/bin + aten/tools/run_tests.sh build/bin + fi +} + +test_torchvision() { + rm -rf ninja + + echo "Installing torchvision at branch master" + rm -rf vision + # TODO: This git clone is bad, it means pushes to torchvision can break + # PyTorch CI + git clone https://github.com/pytorch/vision --quiet + pushd vision + # python setup.py install with a tqdm dependency is broken in the + # Travis Python nightly (but not in latest Python nightlies, so + # this should be a transient requirement...) + # See https://github.com/pytorch/pytorch/issues/7525 + #time python setup.py install + pip install . + popd +} + +test_libtorch() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing libtorch" + CPP_BUILD="$PWD/../cpp-build" + if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + "$CPP_BUILD"/libtorch/bin/test_jit + else + "$CPP_BUILD"/libtorch/bin/test_jit "[cpu]" + fi + python tools/download_mnist.py --quiet -d test/cpp/api/mnist + OMP_NUM_THREADS=2 "$CPP_BUILD"/libtorch/bin/test_api + fi +} + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then + test_python_nn + test_python_all_except_nn + test_aten + test_torchvision + test_libtorch +else + if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + test_python_nn + elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then + test_python_all_except_nn + test_aten + test_torchvision + test_libtorch + fi +fi diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh new file mode 100755 index 0000000..03adf17 --- /dev/null +++ b/.jenkins/pytorch/win-build.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# If you want to rebuild, run this with REBUILD=1 +# If you want to build with CUDA, run this with USE_CUDA=1 +# If you want to build without CUDA, run this with USE_CUDA=0 + +if [ ! -f setup.py ]; then + echo "ERROR: Please run this build script from PyTorch root directory." 
+ exit 1 +fi + +COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-build +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} +if [[ ${JOB_NAME} == *"develop"* ]]; then + export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} +fi + +mkdir -p ci_scripts/ + +cat >ci_scripts/upload_image.py << EOL + +import os +import sys +import boto3 + +IMAGE_COMMIT_TAG = os.getenv('IMAGE_COMMIT_TAG') + +session = boto3.session.Session() +s3 = session.resource('s3') +data = open(sys.argv[1], 'rb') +s3.Bucket('ossci-windows-build').put_object(Key='pytorch/'+IMAGE_COMMIT_TAG+'.7z', Body=data) +object_acl = s3.ObjectAcl('ossci-windows-build','pytorch/'+IMAGE_COMMIT_TAG+'.7z') +response = object_acl.put(ACL='public-read') + +EOL + +cat >ci_scripts/build_pytorch.bat < nul + del %CD%\\tmp_bin\\sccache.exe + if "%BUILD_ENVIRONMENT%"=="" ( + curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %CD%\\tmp_bin\\sccache.exe + ) else ( + aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe + ) + goto :check_sccache + ) +) + +:: Install Miniconda3 +if "%REBUILD%"=="" ( + IF EXIST C:\\Jenkins\\Miniconda3 ( rd /s /q C:\\Jenkins\\Miniconda3 ) + curl -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -O + .\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=C:\\Jenkins\\Miniconda3 +) +call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3 +if "%REBUILD%"=="" ( call conda install -y -q numpy cffi pyyaml boto3 ) + +:: Install ninja +if "%REBUILD%"=="" ( pip install ninja ) + +call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" x86_amd64 + +git submodule update --init --recursive + +set PATH=%CD%\\tmp_bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\libnvvp;%PATH% +set CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set CUDA_PATH_V9_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set NVTOOLSEXT_PATH=C:\\Program Files\\NVIDIA Corporation\\NvToolsExt +set CUDNN_LIB_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\lib\\x64 +set CUDA_TOOLKIT_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set CUDNN_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 + +:: Target only our CI GPU machine's CUDA arch to speed up the build +set TORCH_CUDA_ARCH_LIST=5.2 + +sccache --stop-server +sccache --start-server +sccache --zero-stats +set CC=sccache cl +set CXX=sccache cl + +set DISTUTILS_USE_SDK=1 + +set CMAKE_GENERATOR=Ninja + +if not "%USE_CUDA%"=="1" ( + if "%REBUILD%"=="" ( + set NO_CUDA=1 + python setup.py install + ) + if errorlevel 1 exit /b 1 + if not errorlevel 0 exit /b 1 +) + +if not "%USE_CUDA%"=="0" ( + if "%REBUILD%"=="" ( + sccache --show-stats + sccache --zero-stats + rd /s /q C:\\Jenkins\\Miniconda3\\Lib\\site-packages\\torch + copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe + ) + + set CUDA_NVCC_EXECUTABLE=%CD%\\tmp_bin\\nvcc + + if "%REBUILD%"=="" set NO_CUDA=0 + + python setup.py install && sccache --show-stats && ( + if "%BUILD_ENVIRONMENT%"=="" ( + echo "NOTE: To run \`import torch\`, please make sure to activate the conda environment by running \`call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3\` in Command Prompt before running Git Bash." 
+ ) else ( + 7z a %IMAGE_COMMIT_TAG%.7z C:\\Jenkins\\Miniconda3\\Lib\\site-packages\\torch && python ci_scripts\\upload_image.py %IMAGE_COMMIT_TAG%.7z + ) + ) +) + +EOL + +ci_scripts/build_pytorch.bat +if [ ! -f $IMAGE_COMMIT_TAG.7z ] && [ ! ${BUILD_ENVIRONMENT} == "" ]; then + exit 1 +fi +echo "BUILD PASSED" diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh new file mode 100755 index 0000000..a27b9f4 --- /dev/null +++ b/.jenkins/pytorch/win-test.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-test +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} +if [[ ${JOB_NAME} == *"develop"* ]]; then + export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} +fi + +mkdir -p ci_scripts/ + +cat >ci_scripts/download_image.py << EOL + +import os +import sys +import boto3 +import botocore + +IMAGE_COMMIT_TAG = os.getenv('IMAGE_COMMIT_TAG') + +session = boto3.session.Session() +s3 = session.resource('s3') +BUCKET_NAME = 'ossci-windows-build' +KEY = 'pytorch/'+IMAGE_COMMIT_TAG+'.7z' +LOCAL_FILE_PATH = sys.argv[1] +try: + s3.Bucket(BUCKET_NAME).download_file(KEY, LOCAL_FILE_PATH) +except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == "404": + print("The object does not exist.") + else: + raise + +EOL + +cat >ci_scripts/setup_pytorch_env.bat <ci_scripts/test_python_nn.bat <ci_scripts/test_python_all_except_nn.bat <>>>>>> mod diff --git a/aten/.flake8 b/aten/.flake8 new file mode 100644 index 0000000..5f32207 --- /dev/null +++ b/aten/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 + diff --git a/aten/.gitignore b/aten/.gitignore new file mode 100644 index 0000000..c57b970 --- /dev/null +++ b/aten/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +build/ +*.pyc diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt new file mode 100644 index 0000000..0dc61c5 --- /dev/null +++ b/aten/CMakeLists.txt @@ -0,0 +1,143 @@ +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + if (NOT BUILD_ATEN) + return() + endif() +else() + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + project(ATen CXX C) + include(CMakeDependentOption) + option(USE_CUDA "Use CUDA" ON) + option(USE_ROCM "Use ROCm" OFF) + option(USE_CUDNN "Use cuDNN" ON) + option(USE_MKLDNN "Use MKLDNN" ON) + cmake_dependent_option( + USE_CUDNN "Use cuDNN" ON + "USE_CUDA" OFF) + option(ATEN_NO_TEST "Do not build ATen test binaries" ON) + + # Flag for shared dependencies + set(BUILD_ATEN ON) +endif() + +# Find modules +list(APPEND CMAKE_MODULE_PATH + /usr/lib/x86_64-linux-gnu/ + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/public + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules_CUDA_fix) +list(APPEND CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/) + +cmake_policy(SET CMP0012 NEW) + +############################################# + +set(ATen_CPU_SRCS) +set(ATen_CPU_TEST_SRCS) +set(ATen_CPU_INCLUDE) +set(ATen_THIRD_PARTY_INCLUDE) +set(ATen_CUDA_SRCS) +set(ATen_CUDA_TEST_SRCS) +set(ATen_CUDA_INCLUDE) +set(ATen_CPU_DEPENDENCY_LIBS) +set(ATen_CUDA_DEPENDENCY_LIBS) +set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS) +SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") +SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") +SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # ---[ Build variables set within the cmake tree + include(../cmake/BuildVariables.cmake) + 
set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") + + # ---[ Misc checks to cope with various compiler modes + include(../cmake/MiscCheck.cmake) + + # External projects + include(ExternalProject) + + # ---[ Utils + # TODO: merge the following 3 files into cmake/public/utils.cmake. + include(../cmake/Utils.cmake) + include(../cmake/public/utils.cmake) + + # ---[ Dependencies + include(../cmake/Dependencies.cmake) + list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) + list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) + list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS + ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) +endif() + +if(USE_CUDA) + list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) +endif() + +set(TH_LINK_STYLE STATIC) +add_subdirectory(src/TH) +set(TH_CPU_INCLUDE + # dense + ${CMAKE_CURRENT_SOURCE_DIR}/src/TH + ${CMAKE_CURRENT_SOURCE_DIR}/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/src/THC + + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_BINARY_DIR}/src + ${CMAKE_BINARY_DIR}/aten/src) +list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE}) +add_subdirectory(src/THNN) + +# Find the HIP package, set the HIP paths, load the HIP CMake. +IF(USE_ROCM) + include(LoadHIP) + if (NOT PYTORCH_FOUND_HIP) + MESSAGE(FATAL_ERROR + "Could not find HIP installation") + endif() +ENDIF() + +IF(MSVC) + # we want to respect the standard, and we are bored of those **** . + ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) + LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819 -Xcompiler /wd4503 -Xcompiler /wd4190 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4275 -Xcompiler /wd4522") +ENDIF(MSVC) + +if(USE_ROCM) + SET(AT_CUDA_ENABLED 1) + add_subdirectory(src/THC) + add_subdirectory(src/THCUNN) + message("ROCm is enabled.") +elseif(USE_CUDA) + SET(AT_CUDA_ENABLED 1) + add_subdirectory(src/THC) + add_subdirectory(src/THCUNN) +else() + message("disabling CUDA because USE_CUDA is set false") + SET(AT_CUDA_ENABLED 0) +endif() + +list(APPEND ATen_CPU_INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/src/THNN + ${CMAKE_CURRENT_SOURCE_DIR}/src/THCUNN) + +list(APPEND ATen_CPU_INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/catch/single_include + ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) +add_subdirectory(src/ATen) + +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # Pass source, includes, and libs to parent + set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) + set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) + set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) + set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) + set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) + set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) + set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) + set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) + set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +endif() diff --git a/aten/README.md b/aten/README.md new file mode 100644 index 0000000..e9ada01 --- /dev/null +++ b/aten/README.md @@ -0,0 +1,258 @@ +# ATen: A TENsor library + +ATen is a simple tensor library thats exposes the Tensor operations in Torch +and PyTorch directly in C++11. 
The wrapper respects the semantics of operators +in PyTorch, except minor details due to differences between C++ and Python in +the way default arguments are handled. See the [documentation for tensors](http://pytorch.org/docs/tensors.html) in PyTorch for what these operations do. +ATen's API is auto-generated from the same declarations PyTorch uses so the +two APIs will track each other over time. + +Tensor types are resolved dynamically, such that the API is generic and +does not include templates. That is, there is one `Tensor` type. It can hold a +CPU or CUDA Tensor, and the tensor may have Doubles, Float, Ints, etc. This design +makes it easy to write generic code without templating everything. + +See the _generated_ [`Tensor.h` file](doc/Tensor.h) and [`Functions.h` file](doc/Functions.h) for the provided API. Excerpt: +```c++ +Tensor atan2(const Tensor & other) const; +Tensor & atan2_(const Tensor & other); +Tensor pow(Scalar exponent) const; +Tensor pow(const Tensor & exponent) const; +Tensor & pow_(Scalar exponent); +Tensor & pow_(const Tensor & exponent); +Tensor lerp(const Tensor & end, Scalar weight) const; +Tensor & lerp_(const Tensor & end, Scalar weight); +Tensor histc() const; +Tensor histc(int64_t bins) const; +Tensor histc(int64_t bins, Scalar min) const; +Tensor histc(int64_t bins, Scalar min, Scalar max) const; +``` + +Inplace operations are also provided, and always suffixed by `_` to indicate they will modify the Tensor. + +### Installation + +TH/THC/THNN/THCUNN are provided (as git subtrees), so the repo is standalone. You will need a C++11 compiler, cmake, and the pyyaml python package. +``` + +# Install pyyaml used by python code generation to read API declarations + +# macOS: if you don't have pip +sudo easy_install pip +# Ubuntu: if you don't have pip +apt-get -y install python-pip + +# if you don't have pyyaml +sudo pip install pyyaml + +mkdir build +cd build +cmake .. -DCMAKE_INSTALL_PREFIX=/where/you/want # specify your dest directory +# cmake .. -DUSE_NVRTC=ON -DUSE_TENSORRT=OFF -DCMAKE_INSTALL_PREFIX=../install -DCAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO=OFF -DUSE_CUDA=ON # for CUDA +# cmake .. -DUSE_CUDA=OFF # for CPU only machines +make install +``` + +### Example usage + +Here is a simple example; again, the syntax follows Torch semantics. + +```c++ +using namespace at; // assumed in the following + +Tensor d = CPU(kFloat).ones({3, 4}); +Tensor r = CPU(kFloat).zeros({3,4}); +for(auto i = 0; i < 100000; i++) { + r = r.add(d); + // equivalently + r = r + d; + // or + r += d; +} +``` + +Want this running on the GPU? +```c++ +using namespace at; // assumed in the following + +Tensor d = CUDA(kFloat).ones({3, 4}); +Tensor r = CUDA(kFloat).zeros({3,4}); +for(auto i = 0; i < 100000; i++) { + r = r.add(d); + // equivalently + r = r + d; + // or + r += d; +} +``` + +Expressions like `CUDA(kFloat)` are first-class `at::Type` objects that represent +the type of a Tensor and are used to create Tensors when their type cannot be +inferred. See the _generated_ [Type header](doc/Type.h) for its API. + +See more in [sample files](src/ATen/test). + +### Creating your kernel + +It is easy to create new kernels, thanks to the `dispatch<>()` templated function. 
Example: +```c++ + +// a simple sum kernel (for CPU only) +template +struct sum_op { + // dispatch handles variable arguments for you + Tensor CPU(const Type & t, Tensor & x_) + { + Tensor x = x_.contiguous(); + auto x_p = x.data(); + int64_t size = x.numel(); + T sum = 0; + for(int64_t i = 0; i < size; i++) { + sum += x_p[i]; + } + return sum; + }; + Tensor CUDA(Tensor& x) { + throw std::invalid_argument("device not supported"); + }; +}; + +Tensor a = CPU(kFloat).rand({3, 7}); +std::cout << a << std::endl; +std::cout << dispatch(a.type(),a) << " == " << a.sum() << std::endl; +``` + +### Efficient access to tensor elements + +When using Tensor-wide operations, the relative cost of dynamic dispatch is very small. +However, there are cases, especially in your own kernels, where efficient element-wise access is needed, +and the cost of dynamic dispatch inside the element-wise loop is very high. +ATen provides _accessors_ that are created with a single dynamic check that a Tensor is the type and number of +dimensions. Accessors then expose an API for accessing the Tensor elements efficiently: + +```c++ + +Tensor foo = CPU(kFloat).rand({12,12}); + +// assert foo is 2-dimensional and holds floats. +auto foo_a = foo.accessor(); +float trace = 0; + +for(int i = 0; i < foo_a.size(0); i++) { + // use the accessor foo_a to get tensor data. + trace += foo_a[i][i]; +} +``` + +Accessors are temporary views of a Tensor. They are only valid for the lifetime of the tensor that they +view and hence should only be used locally in a function, like iterators. + +### Using externally created data + +If you already have your tensor data allocated in memory (CPU or CUDA), +you can view that memory as a Tensor in ATen: + +```c++ +float data[] = { 1, 2, 3, + 4, 5, 6}; +auto f = CPU(kFloat).tensorFromBlob(data, {2,3}); +cout << f << endl; +``` + +These tensors cannot be resized because ATen does not own the memory, but otherwise +behave as normal tensors. + +### Scalars and zero-dimensional tensors + +In addition to the `Tensor` objects, ATen also includes `Scalar`s that represent a single number. +Like a Tensor, Scalars are dynamically typed and can hold any one of ATen's [number types](doc/Type.h). +Scalars can be implicitly constructed from C++ number types. Scalars are needed because some functions like `addmm` take numbers along with Tensors and expect these +numbers to be the same dynamic type as the tensor. They are also used in the API to indicate places where +a function will _always_ return a Scalar value, like `sum`. + +```c++ +Tensor addmm(Scalar beta, const Tensor & self, + Scalar alpha, const Tensor & mat1, + const Tensor & mat2); +Scalar sum(const Tensor & self); + +//usage +Tensor a = ... +Tensor b = ... +Tensor c = ... +Tensor r = addmm(1.0, a, .5, b, c); +``` + +In addition to Scalars, ATen also allows Tensor objects to be zero-dimensional. These Tensors hold +a single value and they can be references to a single element in a larger Tensor. They can be used anywhere a Tensor is expected. They are normally created by operators like `select` which reduce the dimensions of +a Tensor. 
+ +```c++ +Tensor two = CPU(kFloat).rand({10,20}); +two[1][2] = 4; +//~~~~~~~ zero-dimensional Tensor +``` + +It is possible to convert between Scalar and zero-dim Tensors: + +```c++ +Tensor zero_dim = CPU(kFloat).scalarTensor(4); +Scalar from_tensor = Scalar(zero_dim); //only valid when zero_dim.dim() == 0; +``` + +### Avoiding unnecessary CUDA synchronization in your kernels when using Scalars + +Moving a single number from the GPU to the CPU introduces a synchronization point +that can add latency to your program. In certain cases the result of a GPU operator like `sum` which +returns a Scalar may be plugged into another GPU operator as an argument. If Scalars were always copied +to the CPU, this would result in 2 copies. To avoid these synchronizations, Scalar objects can be +optionally backed by a zero-dim Tensor, and are only copied to the CPU when requested. + +```c++ +auto a = CUDA(kFloat).rand({3,4}); +Scalar on_gpu = Scalar(a[1][1]); //backed by zero-dim Tensor +assert(on_gpu.isBackedByTensor()); + +double value = on_gpu.toDouble(); // copied to CPU, if it was backed by GPU Tensor. +Scalar svalue = on_gpu.local(); // force the Scalar to become local to CPU. + +// get the scalar as a zero-dim tensor. If it was already backed +// by a zero-dim Tensor then this op has no synchronization. +// if the Scalar was local on CPU, it performs the copy +Tensor same_tensor = CUDA(kFloat).scalarTensor(on_gpu); +``` + +Operators aware of the location of Scalars can arrange to do the minimal number of copies required. + +### Developer notes + +ATen relies heavily on code generation to automatically generate headers +and implementations for all of the tensor methods it supports. The main +entry point for the script which does all this work is +[`src/ATen/gen.py`](src/ATen/gen.py), which ingests +[`src/ATen/Declarations.cwrap`](src/ATen/Declarations.cwrap), +[`src/ATen/nn.yaml`](src/ATen/nn.yaml), +[`src/ATen/native/native_functions.yaml`](src/ATen/native/native_functions.yaml) and the THNN/THCUNN headers and +produces all of the headers and wrapping code necessary to generate +the ATen interface. + +If you need to understand how ATen understands a declaration after all +of this processing occurs, it's helpful to look at the generated file +`Declarations.yaml` (NB: not cwrap) which contains information for all +ATen methods in a uniform manner. This file is utilized by PyTorch +which further extends the ATen interface with support for automatic +differentation. + +#### Note [ATen preprocessor philosophy] + +ATen is designed to be simple to use, and one of the things this implies is +that it should not be necessary to use preprocessor macros when using ATen; +we would rather provide all symbols, even for functionality that is not +available on the system ATen is running on. + +This means that internally inside ATen, whereas other libraries might +simply omit source files for, e.g., CuDNN, when CuDNN libraries are not +installed, ATen will always build these source files, compiling stub +functions for anything that is not available. ATen never uses +`AT_ENABLED_CUDA()` in header files, and all types in ATen's public API +are always available no matter your build configuration. 
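+
+To make Note [ATen preprocessor philosophy] concrete, here is a minimal,
+hypothetical sketch of the pattern it describes; the function name, macro, and
+error message below are illustrative assumptions, not actual ATen code. The
+idea is that the same symbol always exists: when the backend is unavailable,
+a stub is compiled that fails at runtime instead of the symbol disappearing at
+compile time, so callers never need preprocessor guards of their own.
+
+```c++
+// Hypothetical illustration of the "always provide the symbol" pattern.
+#include <stdexcept>
+#include <string>
+
+// Always declared in the (hypothetical) public header, regardless of build flags.
+std::string cudnn_version_string();
+
+#if defined(MYLIB_WITH_CUDNN)  // hypothetical build flag
+// Compiled when the backend is available.
+std::string cudnn_version_string() {
+  return "cuDNN-enabled build";
+}
+#else
+// Stub compiled otherwise: the symbol still exists in every build configuration,
+// and misuse surfaces as a clear runtime error rather than a link-time hole.
+std::string cudnn_version_string() {
+  throw std::runtime_error("this build was compiled without cuDNN support");
+}
+#endif
+```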
diff --git a/aten/conda/build.sh b/aten/conda/build.sh new file mode 100644 index 0000000..f0ca38f --- /dev/null +++ b/aten/conda/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +if [ -z "$PREFIX" ]; then + PREFIX="$CONDA_PREFIX" +fi + +# When conda-build constructs a new working copy to perform a build +# in, it recursively copies *all* files and directories in the original +# source directory, including any pre-existing build products (e.g., +# if you previously ran cmake.) This is problematic, because if +# a 'build' directory already exists, cmake will reuse build settings +# rather than recompute them from scratch. We want a fresh build, so +# we prophylactically remove the build directory. +rm -rf build || true + +mkdir -p build +cd build +cmake -DCMAKE_INSTALL_PREFIX="$PREFIX" -DCMAKE_PREFIX_PATH="$PREFIX" -DCMAKE_BUILD_TYPE=Release $CONDA_CMAKE_ARGS .. +make install -j20 diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml new file mode 100644 index 0000000..7493e5d --- /dev/null +++ b/aten/conda/meta.yaml @@ -0,0 +1,33 @@ +{% set version = "0.1.dev" %} + +package: + name: aten + version: {{ version }} + +source: + path: .. + +build: + number: 1 + skip: True # [win] + script_env: + - CONDA_CMAKE_ARGS + +requirements: + build: + - cmake + - pyyaml + - setuptools + - python + - mkl # [not osx] + run: + - mkl # [not osx] + +about: + home: https://github.com/zdevito/ATen + license: BSD + summary: A TENsor library for C++11 + +extra: + recipe-maintainers: + - ezyang diff --git a/aten/doc/Functions.h b/aten/doc/Functions.h new file mode 100644 index 0000000..2fd9d72 --- /dev/null +++ b/aten/doc/Functions.h @@ -0,0 +1,3133 @@ +#pragma once + +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Tensor.h" +#include "ATen/Storage.h" +#include "ATen/Generator.h" + + +namespace at { + +static inline Tensor & zeros_out(Tensor & result, IntList size); +static inline Tensor & zeros_like_out(Tensor & result, const Tensor & input); +static inline Tensor zeros_like(const Tensor & input); +static inline Tensor & ones_out(Tensor & result, IntList size); +static inline Tensor & ones_like_out(Tensor & result, const Tensor & input); +static inline Tensor ones_like(const Tensor & input); +static inline int64_t numel(const Tensor & self); +static inline Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask); +static inline Tensor masked_select(const Tensor & self, const Tensor & mask); +static inline Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1); +static inline Tensor t(const Tensor & self); +static inline Tensor & nonzero_out(Tensor & result, const Tensor & self); +static inline Tensor nonzero(const Tensor & self); +static inline Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index); +static inline Tensor take(const Tensor & self, const Tensor & index); +static inline Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step=1); +static inline Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step=1); +static inline Tensor & arange_out(Tensor & result, Scalar end); +static inline Tensor & gather_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor gather(const Tensor & self, int64_t dim, const Tensor & index); +static inline 
bool equal(const Tensor & self, const Tensor & other); +static inline Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __and__(const Tensor & self, Scalar other); +static inline Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __and__(const Tensor & self, const Tensor & other); +static inline Tensor & __iand__(Tensor & self, Scalar other); +static inline Tensor & __iand__(Tensor & self, const Tensor & other); +static inline Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __or__(const Tensor & self, Scalar other); +static inline Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __or__(const Tensor & self, const Tensor & other); +static inline Tensor & __ior__(Tensor & self, Scalar other); +static inline Tensor & __ior__(Tensor & self, const Tensor & other); +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __xor__(const Tensor & self, Scalar other); +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __xor__(const Tensor & self, const Tensor & other); +static inline Tensor & __ixor__(Tensor & self, Scalar other); +static inline Tensor & __ixor__(Tensor & self, const Tensor & other); +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __lshift__(const Tensor & self, Scalar other); +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __lshift__(const Tensor & self, const Tensor & other); +static inline Tensor & __ilshift__(Tensor & self, Scalar other); +static inline Tensor & __ilshift__(Tensor & self, const Tensor & other); +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __rshift__(const Tensor & self, Scalar other); +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __rshift__(const Tensor & self, const Tensor & other); +static inline Tensor & __irshift__(Tensor & self, Scalar other); +static inline Tensor & __irshift__(Tensor & self, const Tensor & other); +static inline Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor lt(const Tensor & self, Scalar other); +static inline Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor lt(const Tensor & self, const Tensor & other); +static inline Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor gt(const Tensor & self, Scalar other); +static inline Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor gt(const Tensor & self, const Tensor & other); +static inline Tensor & le_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor le(const Tensor & self, Scalar other); +static inline Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor le(const Tensor & self, const Tensor & other); +static inline Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor ge(const Tensor & self, Scalar other); +static inline Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline 
Tensor ge(const Tensor & self, const Tensor & other); +static inline Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor eq(const Tensor & self, Scalar other); +static inline Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor eq(const Tensor & self, const Tensor & other); +static inline Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor ne(const Tensor & self, Scalar other); +static inline Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor ne(const Tensor & self, const Tensor & other); +static inline std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor min(const Tensor & self, const Tensor & other); +static inline Tensor min(const Tensor & self); +static inline std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor max(const Tensor & self, const Tensor & other); +static inline Tensor max(const Tensor & self); +static inline std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false); +static inline std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false); +static inline std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool keepdim=false); +static inline std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false); +static inline std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor median(const Tensor & self); +static inline std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool descending=false); +static inline std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false); +static inline std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +static inline std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +static inline Tensor & abs_out(Tensor & result, const Tensor & self); +static inline Tensor abs(const Tensor & self); +static inline Tensor & sigmoid_out(Tensor & result, const Tensor & self); +static inline Tensor sigmoid(const Tensor & self); +static inline Tensor & log_out(Tensor & result, const Tensor & self); +static inline Tensor log(const Tensor & self); +static inline Tensor & log1p_out(Tensor & result, const Tensor & self); +static inline Tensor log1p(const Tensor & self); +static inline Tensor & lgamma_out(Tensor & result, const Tensor & self); +static inline Tensor lgamma(const Tensor & self); +static inline Tensor & digamma_out(Tensor & result, const Tensor & self); +static inline Tensor digamma(const Tensor & self); +static inline Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & 
self); +static inline Tensor polygamma(int64_t n, const Tensor & self); +static inline Tensor & exp_out(Tensor & result, const Tensor & self); +static inline Tensor exp(const Tensor & self); +static inline Tensor & expm1_out(Tensor & result, const Tensor & self); +static inline Tensor expm1(const Tensor & self); +static inline Tensor & cos_out(Tensor & result, const Tensor & self); +static inline Tensor cos(const Tensor & self); +static inline Tensor & acos_out(Tensor & result, const Tensor & self); +static inline Tensor acos(const Tensor & self); +static inline Tensor & cosh_out(Tensor & result, const Tensor & self); +static inline Tensor cosh(const Tensor & self); +static inline Tensor & sin_out(Tensor & result, const Tensor & self); +static inline Tensor sin(const Tensor & self); +static inline Tensor & asin_out(Tensor & result, const Tensor & self); +static inline Tensor asin(const Tensor & self); +static inline Tensor & sinh_out(Tensor & result, const Tensor & self); +static inline Tensor sinh(const Tensor & self); +static inline Tensor & tan_out(Tensor & result, const Tensor & self); +static inline Tensor tan(const Tensor & self); +static inline Tensor & atan_out(Tensor & result, const Tensor & self); +static inline Tensor atan(const Tensor & self); +static inline Tensor & tanh_out(Tensor & result, const Tensor & self); +static inline Tensor tanh(const Tensor & self); +static inline Tensor & erf_out(Tensor & result, const Tensor & self); +static inline Tensor erf(const Tensor & self); +static inline Tensor & erfc_out(Tensor & result, const Tensor & self); +static inline Tensor erfc(const Tensor & self); +static inline Tensor & erfinv_out(Tensor & result, const Tensor & self); +static inline Tensor erfinv(const Tensor & self); +static inline Tensor & sqrt_out(Tensor & result, const Tensor & self); +static inline Tensor sqrt(const Tensor & self); +static inline Tensor & rsqrt_out(Tensor & result, const Tensor & self); +static inline Tensor rsqrt(const Tensor & self); +static inline Tensor & ceil_out(Tensor & result, const Tensor & self); +static inline Tensor ceil(const Tensor & self); +static inline Tensor & floor_out(Tensor & result, const Tensor & self); +static inline Tensor floor(const Tensor & self); +static inline Tensor & round_out(Tensor & result, const Tensor & self); +static inline Tensor round(const Tensor & self); +static inline Tensor & trunc_out(Tensor & result, const Tensor & self); +static inline Tensor trunc(const Tensor & self); +static inline Tensor & frac_out(Tensor & result, const Tensor & self); +static inline Tensor frac(const Tensor & self); +static inline Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor mean(const Tensor & self); +static inline Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor var(const Tensor & self, bool unbiased=true); +static inline Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor std(const Tensor & self, bool unbiased=true); +static inline Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim=false); 
+static inline Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false); +static inline Tensor norm(const Tensor & self, Scalar p=2); +static inline Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); +static inline Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); +static inline Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2); +static inline Tensor & reciprocal_out(Tensor & result, const Tensor & self); +static inline Tensor reciprocal(const Tensor & self); +static inline Tensor & neg_out(Tensor & result, const Tensor & self); +static inline Tensor neg(const Tensor & self); +static inline Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor atan2(const Tensor & self, const Tensor & other); +static inline Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent); +static inline Tensor pow(const Tensor & self, Scalar exponent); +static inline Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent); +static inline Tensor pow(const Tensor & self, const Tensor & exponent); +static inline Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self); +static inline Tensor pow(Scalar base, const Tensor & self); +static inline Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight); +static inline Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight); +static inline Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100); +static inline Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100); +static inline Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0); +static inline Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0); +static inline Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor sum(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor sum(const Tensor & self); +static inline Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor prod(const Tensor & self); +static inline Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim); +static inline Tensor cumsum(const Tensor & self, int64_t dim); +static inline Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim); +static inline Tensor cumprod(const Tensor & self, int64_t dim); +static inline Tensor & sign_out(Tensor & result, const Tensor & self); +static inline Tensor sign(const Tensor & self); +static inline Tensor trace(const Tensor & self); +static inline Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, SparseTensor other, Scalar alpha=1); +static inline Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1); +static 
inline Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor & mul_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor mul(const Tensor & self, Scalar other); +static inline Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor mul(const Tensor & self, const Tensor & other); +static inline Tensor & div_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor div(const Tensor & self, Scalar other); +static inline Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor div(const Tensor & self, const Tensor & other); +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor fmod(const Tensor & self, Scalar other); +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor fmod(const Tensor & self, const Tensor & other); +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor remainder(const Tensor & self, Scalar other); +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor remainder(const Tensor & self, const Tensor & other); +static inline Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max); +static inline Tensor clamp(const Tensor & self, Scalar min, Scalar max); +static inline Tensor & clamp_(Tensor & self, Scalar min, Scalar max); +static inline Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min); +static inline Tensor clamp_min(const Tensor & self, Scalar min); +static inline Tensor & clamp_min_(Tensor & self, Scalar min); +static inline Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max); +static inline Tensor clamp_max(const Tensor & self, Scalar max); +static inline Tensor & clamp_max_(Tensor & self, Scalar max); +static inline Tensor _dot(const Tensor & self, const Tensor & tensor); +static inline Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor tril(const Tensor & self, int64_t diagonal=0); +static inline Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor triu(const Tensor & self, int64_t diagonal=0); +static inline Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim=-1); +static inline Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1); +static inline Tensor & eye_out(Tensor & result, int64_t n, int64_t m=-1); +static inline Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor diag(const Tensor & self, int64_t diagonal=0); +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar 
+static inline Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1);
+static inline Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2);
+static inline Tensor _ger(const Tensor & self, const Tensor & vec2);
+static inline Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec);
+static inline Tensor _mv(const Tensor & self, const Tensor & vec);
+static inline Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+static inline Tensor _mm(const Tensor & self, const Tensor & mat2);
+static inline Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+static inline Tensor bmm(const Tensor & self, const Tensor & mat2);
+static inline Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline std::tuple<Tensor &,Tensor &> gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor,Tensor> gesv(const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor &,Tensor &> gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor,Tensor> gels(const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor &,Tensor &> trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+static inline std::tuple<Tensor,Tensor> trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+static inline std::tuple<Tensor &,Tensor &> symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false, bool upper=true);
+static inline std::tuple<Tensor,Tensor> symeig(const Tensor & self, bool eigenvectors=false, bool upper=true);
+static inline std::tuple<Tensor &,Tensor &> eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false);
+static inline std::tuple<Tensor,Tensor> eig(const Tensor & self, bool eigenvectors=false);
+static inline std::tuple<Tensor &,Tensor &,Tensor &> svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some=true);
+static inline std::tuple<Tensor,Tensor,Tensor> svd(const Tensor & self, bool some=true);
+static inline Tensor & inverse_out(Tensor & output, const Tensor & self);
+static inline Tensor inverse(const Tensor & self);
+static inline Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper=true);
+static inline Tensor potrf(const Tensor & self, bool upper=true);
+static inline Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper=true);
+static inline Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true);
+static inline Tensor & potri_out(Tensor & output, const Tensor & self, bool upper=true);
+static inline Tensor potri(const Tensor & self, bool upper=true);
+static inline std::tuple<Tensor &,Tensor &> pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper=true, Scalar tol=-1);
+static inline std::tuple<Tensor,Tensor> pstrf(const Tensor & self, bool upper=true, Scalar tol=-1);
+static inline std::tuple<Tensor &,Tensor &> qr_out(Tensor & res1, Tensor & res2, const Tensor & self);
+static inline std::tuple<Tensor,Tensor> qr(const Tensor & self);
+static inline std::tuple<Tensor &,Tensor &> geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+static inline std::tuple<Tensor,Tensor> geqrf(const Tensor & self);
+static inline Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2);
+static inline Tensor orgqr(const Tensor & self, const Tensor & input2);
+static inline Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false);
+static inline Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false);
+static inline std::tuple<Tensor &,Tensor &> btrifact_out(Tensor & result, Tensor & pivots, const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor,Tensor> btrifact(const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor &,Tensor &,Tensor &> btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor,Tensor,Tensor> btrifact_with_info(const Tensor & self, bool pivot=true);
+static inline Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots);
+static inline Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots);
+static inline Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator=nullptr);
+static inline Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr);
+static inline Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, const Tensor & mean, double std=1, Generator * generator=nullptr);
+static inline Tensor normal(const Tensor & mean, double std=1, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor normal(double mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor & rand_out(Tensor & result, IntList size, Generator * generator=nullptr);
+static inline Tensor & randn_out(Tensor & result, IntList size, Generator * generator=nullptr);
+static inline Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator=nullptr);
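[Editor's note: a minimal, hedged sketch of how the free functions declared above are intended to be called. The <ATen/ATen.h> include and the at::CPU(at::kFloat) factory follow the ATen conventions of this period and are assumptions for illustration; only the signatures that appear in this header are used.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>
#include <tuple>

void blas_and_linalg_sketch() {
  at::Tensor a    = at::CPU(at::kFloat).rand({2, 3});   // assumed factory API
  at::Tensor b    = at::CPU(at::kFloat).rand({3, 4});
  at::Tensor bias = at::CPU(at::kFloat).zeros({2, 4});

  // addmm(self, mat1, mat2, beta, alpha) computes beta*self + alpha*(mat1 x mat2).
  at::Tensor y = at::addmm(bias, a, b);

  // The *_out overloads write into a caller-supplied result instead of allocating.
  at::Tensor clamped = at::CPU(at::kFloat).zeros({2, 4});
  at::clamp_out(clamped, y, /*min=*/-1, /*max=*/1);

  // Factorizations return their multiple results as std::tuple.
  at::Tensor m = at::CPU(at::kFloat).rand({4, 4});
  at::Tensor q, r;
  std::tie(q, r) = at::qr(m);
}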
+static inline Tensor bernoulli(const Tensor & self, Generator * generator=nullptr); +static inline Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator=nullptr); +static inline Tensor _standard_gamma(const Tensor & self, Generator * generator=nullptr); +static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total); +static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total); +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size); +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values); +static inline Tensor alias(const Tensor & self); +static inline Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim=0); +static inline Tensor _cat(TensorList tensors, int64_t dim=0); +static inline Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true); +static inline Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true); +static inline Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor 
l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true); +static inline Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true); +static inline Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline std::tuple multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline std::tuple multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target); +static inline Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target); +static inline Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline Tensor nll_loss(const Tensor & self, const Tensor & target, const 
Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline std::tuple nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline std::tuple nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline std::tuple nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline std::tuple nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true); +static inline Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true); +static inline Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor & 
soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor elu(const Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output); +static inline Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output); +static inline Tensor & elu_(Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim=-1); +static inline Tensor glu(const Tensor & self, int64_t dim=-1); +static inline Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor glu_forward(const Tensor & self, int64_t dim); +static inline Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim); +static inline Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim); +static inline Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor hardtanh(const Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & hardtanh_(Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope=0.01); +static inline Tensor leaky_relu(const Tensor & self, Scalar negative_slope=0.01); +static inline Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope); +static inline Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope); +static inline Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope); +static inline Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope); +static inline Tensor & leaky_relu_(Tensor & self, Scalar negative_slope=0.01); +static inline Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope); +static inline Tensor & log_sigmoid_out(Tensor & output, const Tensor & self); +static inline Tensor log_sigmoid(const Tensor & self); +static inline std::tuple log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self); +static inline std::tuple log_sigmoid_forward(const Tensor & self); +static inline Tensor & 
log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer); +static inline Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer); +static inline Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor log_softmax(const Tensor & self, int64_t dim); +static inline Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor log_softmax_forward(const Tensor & self, int64_t dim); +static inline Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight); +static inline Tensor prelu(const Tensor & self, const Tensor & weight); +static inline Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight); +static inline Tensor prelu_forward(const Tensor & self, const Tensor & weight); +static inline std::tuple prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight); +static inline std::tuple prelu_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, std::array output_mask={{true, true}}); +static inline Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); +static inline Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); +static inline Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor softmax(const Tensor & self, int64_t dim); +static inline Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor softmax_forward(const Tensor & self, int64_t dim); +static inline Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); 
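[Editor's note: a hedged sketch of the op / op_forward / op_backward triplet that the loss and activation declarations above follow. Only signatures shown in this header are used; the at::CPU(at::kFloat) factory is again an assumed convenience from ATen documentation of this era.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void loss_backward_sketch() {
  at::Tensor input  = at::CPU(at::kFloat).rand({4, 5});
  at::Tensor target = at::CPU(at::kFloat).rand({4, 5});

  // Element-wise loss (reduce=false) keeps the per-element shape.
  at::Tensor loss = at::mse_loss(input, target, /*size_average=*/true, /*reduce=*/false);

  // The matching *_backward takes grad_output, the original inputs, and the same
  // reduction flags, and returns grad_input (d loss / d input).
  at::Tensor grad_output = at::CPU(at::kFloat).ones({4, 5});
  at::Tensor grad_input  = at::mse_loss_backward(grad_output, input, target,
                                                 /*size_average=*/true, /*reduce=*/false);
}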
+static inline Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta=1, Scalar threshold=20); +static inline Tensor softplus(const Tensor & self, Scalar beta=1, Scalar threshold=20); +static inline Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold); +static inline Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold); +static inline Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output); +static inline Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output); +static inline Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd=0.5); +static inline Tensor softshrink(const Tensor & self, Scalar lambd=0.5); +static inline Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd); +static inline Tensor softshrink_forward(const Tensor & self, Scalar lambd); +static inline Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd); +static inline Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd); +static inline Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold(const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & adaptive_avg_pool2d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self); +static inline Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self); +static inline Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self); +static inline Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self); +static inline std::tuple adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); 
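[Editor's note: a brief sketch of the three calling conventions visible above: a functional form that allocates its result, a *_out form that writes into a preallocated tensor, and a trailing-underscore form that mutates self in place. The adaptive pooling declarations continue immediately after this note. at::CPU(at::kFloat) is an assumed factory, not declared in this header.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void variant_sketch() {
  at::Tensor x = at::CPU(at::kFloat).rand({8});

  at::Tensor y = at::softplus(x);                    // allocates a fresh result
  at::Tensor out = at::CPU(at::kFloat).zeros({8});
  at::softplus_out(out, x);                          // fills 'out', returns it by reference
  at::threshold_(x, /*threshold=*/0, /*value=*/0);   // in place: ReLU-style clamp of x
}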
+static inline std::tuple adaptive_max_pool2d(const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline std::tuple adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d(const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor & avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline std::tuple 
fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices); +static inline Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices); +static inline std::tuple max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline std::tuple max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline std::tuple max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline std::tuple max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & 
max_unpool2d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d_forward(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d_forward(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d(const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d(const Tensor & 
self, IntList padding); +static inline Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d(const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_linear1d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_bilinear2d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_trilinear3d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_trilinear3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & 
upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & _sigmoid_out(Tensor & output, const Tensor & self); +static inline Tensor _sigmoid(const Tensor & self); +static inline Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self); +static inline Tensor _sigmoid_forward(const Tensor & self); +static inline Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output); +static inline Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output); +static inline Tensor & _tanh_out(Tensor & output, const Tensor & self); +static inline Tensor _tanh(const Tensor & self); +static inline Tensor & _tanh_forward_out(Tensor & output, const Tensor & self); +static inline Tensor _tanh_forward(const Tensor & self); +static inline Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output); +static inline Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output); +static inline Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_backward_out(Tensor & grad_input, Tensor 
& grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std); +static inline std::tuple thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, 
const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, std::array output_mask={{true, true}}); +static inline Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); 
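[Editor's note: a hedged usage sketch for the THNN convolution entry points declared above; the thnn_conv3d backward declarations continue right after this note. NCHW layout and the at::CPU(at::kFloat) factory are assumptions; kernel_size must match the spatial dimensions of the weight tensor.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void conv_sketch() {
  at::Tensor input  = at::CPU(at::kFloat).rand({1, 3, 32, 32});  // N, C, H, W
  at::Tensor weight = at::CPU(at::kFloat).rand({16, 3, 3, 3});   // out_ch, in_ch, kH, kW
  at::Tensor bias   = at::CPU(at::kFloat).zeros({16});

  // Defaults from the declaration: stride=1, padding=0.
  at::Tensor out = at::thnn_conv2d(input, weight, /*kernel_size=*/{3, 3}, bias,
                                   /*stride=*/{1, 1}, /*padding=*/{0, 0});
}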
+static inline std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size); +static inline std::tuple 
adaptive_max_pool1d(const Tensor & self, IntList output_size); +static inline bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08); +static inline Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled); +static inline Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr); +static inline Tensor & bernoulli_(Tensor & self, double p=0.5, Generator * generator=nullptr); +static inline Tensor cat(TensorList tensors, int64_t dim=0); +static inline Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim=0); +static inline Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0); +static inline bool cudnn_is_acceptable(const Tensor & self); +static inline Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups); +static inline Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled); +static inline Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding); +static inline std::tuple _convolution_double_backward(const Tensor & ggI, const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask); +static inline Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv_tbc(const Tensor & self, const 
Tensor & weight, const Tensor & bias, int64_t pad); +static inline std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad); +static inline Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W); +static inline Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W); +static inline std::tuple cudnn_batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon); +static inline std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon); +static inline Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask); +static inline Tensor cudnn_convolution_backward_bias(const Tensor & grad_output); +static inline Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask); +static inline Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output); +static inline Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, 
IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid); +static inline std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output); +static inline Tensor det(const Tensor & self); +static inline std::tuple _det_with_svd(const Tensor & self); +static inline Tensor dot(const Tensor & self, const Tensor & tensor); +static inline Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +static inline Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse); +static inline Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +static inline Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type); +static inline Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +static inline Tensor empty_like(const Tensor & self); +static inline std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false); +static inline Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse); +static inline Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode); +static inline Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode); +static inline Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce); +static inline Tensor ger(const Tensor & self, const Tensor & vec2); +static inline Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2); +static inline Tensor index(const Tensor & self, TensorList indices); +static inline Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values); +static inline bool is_cuda(const Tensor & self); +static inline bool is_distributed(const Tensor & self); +static inline bool is_floating_point(const Tensor & self); +static inline bool is_nonzero(const Tensor & self); +static inline bool is_same_size(const Tensor & self, const Tensor & other); +static inline bool is_signed(const Tensor & self); +static inline bool is_sparse(const Tensor & self); +static inline Tensor matmul(const Tensor & self, const Tensor & other); +static inline std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline Tensor mm(const Tensor & self, const Tensor & mat2); +static inline Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2); +static inline Tensor mv(const Tensor & self, const Tensor & vec); +static inline Tensor & mv_out(Tensor & result, const 
Tensor & self, const Tensor & vec); +static inline Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length); +static inline Tensor pin_memory(const Tensor & self); +static inline Tensor rand_like(const Tensor & self); +static inline Tensor randn_like(const Tensor & self); +static inline Tensor repeat(const Tensor & self, IntList repeats); +static inline std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale); +static inline Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes); +static inline Tensor rrelu(const Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_(Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor select(const Tensor & self, int64_t dim, int64_t index); +static inline Tensor selu(const Tensor & self); +static inline Tensor & selu_(Tensor & self); +static inline int64_t size(const Tensor & self, int64_t dim); +static inline Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1); +static inline std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0); +static inline Tensor squeeze(const Tensor & self); +static inline Tensor squeeze(const Tensor & self, int64_t dim); +static inline Tensor & squeeze_(Tensor & self); +static inline Tensor & squeeze_(Tensor & self, int64_t dim); +static inline Tensor stack(TensorList tensors, int64_t dim=0); +static inline Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim=0); +static inline Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0); +static inline int64_t stride(const Tensor & self, int64_t dim); +static inline Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1); +static inline Tensor & t_(Tensor & self); +static inline Tensor type_as(const Tensor & self, const Tensor & other); +static inline Tensor unsqueeze(const Tensor & self, int64_t dim); +static inline Tensor & unsqueeze_(Tensor & self, int64_t dim); +static inline Tensor view_as(const Tensor & self, const Tensor & other); +static inline Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other); +static inline Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other); +static inline Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output); +static inline Tensor poisson(const Tensor & self, Generator * generator=nullptr); +static inline Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional); +static inline std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state); +static inline std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & 
weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array<bool,4> output_mask);
+
+static inline Type & infer_type(const Tensor & t) {
+  AT_ASSERT(t.defined(), "undefined Tensor");
+  return t.type();
+}
+static inline Type & infer_type(const TensorList & tl) {
+  AT_ASSERT(tl.size() > 0, "expected a non-empty list of Tensors");
+  return tl[0].type();
+}
+// function definitions are all static inline because
+// they are one-line statically dispatched functions that
+// invoke the actual dynamic dispatch on the correct argument
+static inline Tensor & zeros_out(Tensor & result, IntList size) {
+  return infer_type(result).zeros_out(result, size);
+}
+static inline Tensor & zeros_like_out(Tensor & result, const Tensor & input) {
+  return infer_type(result).zeros_like_out(result, input);
+}
+static inline Tensor zeros_like(const Tensor & input) {
+  return infer_type(input).zeros_like(input);
+}
+static inline Tensor & ones_out(Tensor & result, IntList size) {
+  return infer_type(result).ones_out(result, size);
+}
+static inline Tensor & ones_like_out(Tensor & result, const Tensor & input) {
+  return infer_type(result).ones_like_out(result, input);
+}
+static inline Tensor ones_like(const Tensor & input) {
+  return infer_type(input).ones_like(input);
+}
+static inline int64_t numel(const Tensor & self) {
+  return infer_type(self).numel(self);
+}
+static inline Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) {
+  return infer_type(self).masked_select_out(result, self, mask);
+}
+static inline Tensor masked_select(const Tensor & self, const Tensor & mask) {
+  return infer_type(self).masked_select(self, mask);
+}
+static inline Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) {
+  return infer_type(self).transpose(self, dim0, dim1);
+}
+static inline Tensor t(const Tensor & self) {
+  return infer_type(self).t(self);
+}
+static inline Tensor & nonzero_out(Tensor & result, const Tensor & self) {
+  return infer_type(self).nonzero_out(result, self);
+}
+static inline Tensor nonzero(const Tensor & self) {
+  return infer_type(self).nonzero(self);
+}
+static inline Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) {
+  return infer_type(self).index_select_out(result, self, dim, index);
+}
+static inline Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) {
+  return infer_type(self).index_select(self, dim, index);
+}
+static inline Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index) {
+  return infer_type(self).take_out(result, self, index);
+}
+static inline Tensor take(const Tensor & self, const Tensor & index) {
+  return infer_type(self).take(self, index);
+}
+static inline Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step) {
+  return infer_type(result).range_out(result, start, end, step);
+}
+static inline Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step) {
+  return infer_type(result).arange_out(result, start, end, step);
+}
+static inline Tensor & arange_out(Tensor & result, Scalar end) {
+  return infer_type(result).arange_out(result, end);
+}
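The three-line comment above is the only explanation in this generated block, so here is a minimal, self-contained sketch of the pattern it describes. The names `MyType`, `MyTensor`, and `CpuType` are invented stand-ins for illustration only (they are not ATen classes); the sketch merely mirrors the shape of the generated wrappers, where a `static inline` free function recovers a `Type` through `infer_type` and immediately makes a virtual call on it.

```
// Illustrative stand-ins only: MyType / MyTensor / CpuType are invented here
// to mirror the generated static-inline -> virtual dispatch pattern.
#include <cassert>
#include <iostream>

struct MyType {                              // plays the role of at::Type
  virtual ~MyType() = default;
  virtual int add(int a, int b) const = 0;   // the dynamic-dispatch step
};

struct CpuType : MyType {                    // one concrete backend
  int add(int a, int b) const override { return a + b; }
};

struct MyTensor {                            // plays the role of at::Tensor
  const MyType * type_;
  bool defined() const { return type_ != nullptr; }
  const MyType & type() const { return *type_; }
};

// Analogue of infer_type(): pull the Type off the argument, asserting it is defined.
static inline const MyType & infer_type(const MyTensor & t) {
  assert(t.defined() && "undefined Tensor");
  return t.type();
}

// Analogue of the generated one-liners: a statically dispatched wrapper that
// forwards to the dynamically dispatched method on the correct argument.
static inline int add(const MyTensor & self, int a, int b) {
  return infer_type(self).add(a, b);
}

int main() {
  CpuType cpu;
  MyTensor t{&cpu};
  std::cout << add(t, 2, 3) << "\n";         // prints 5 via CpuType::add
}
```

Every wrapper in this header has that same two-step shape, so a call such as `at::mm(a, b)` or `at::transpose(a, 0, 1)` resolves statically to the inline function and then, at run time, to whatever implementation the tensor's `Type` (backend plus scalar type) provides.

+static inline Tensor & gather_out(Tensor & result, const Tensor & self,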
int64_t dim, const Tensor & index) { + return infer_type(self).gather_out(result, self, dim, index); +} +static inline Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) { + return infer_type(self).gather(self, dim, index); +} +static inline bool equal(const Tensor & self, const Tensor & other) { + return infer_type(self).equal(self, other); +} +static inline Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__and___out(result, self, other); +} +static inline Tensor __and__(const Tensor & self, Scalar other) { + return infer_type(self).__and__(self, other); +} +static inline Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__and___out(result, self, other); +} +static inline Tensor __and__(const Tensor & self, const Tensor & other) { + return infer_type(self).__and__(self, other); +} +static inline Tensor & __iand__(Tensor & self, Scalar other) { + return infer_type(self).__iand__(self, other); +} +static inline Tensor & __iand__(Tensor & self, const Tensor & other) { + return infer_type(self).__iand__(self, other); +} +static inline Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__or___out(result, self, other); +} +static inline Tensor __or__(const Tensor & self, Scalar other) { + return infer_type(self).__or__(self, other); +} +static inline Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__or___out(result, self, other); +} +static inline Tensor __or__(const Tensor & self, const Tensor & other) { + return infer_type(self).__or__(self, other); +} +static inline Tensor & __ior__(Tensor & self, Scalar other) { + return infer_type(self).__ior__(self, other); +} +static inline Tensor & __ior__(Tensor & self, const Tensor & other) { + return infer_type(self).__ior__(self, other); +} +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__xor___out(result, self, other); +} +static inline Tensor __xor__(const Tensor & self, Scalar other) { + return infer_type(self).__xor__(self, other); +} +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__xor___out(result, self, other); +} +static inline Tensor __xor__(const Tensor & self, const Tensor & other) { + return infer_type(self).__xor__(self, other); +} +static inline Tensor & __ixor__(Tensor & self, Scalar other) { + return infer_type(self).__ixor__(self, other); +} +static inline Tensor & __ixor__(Tensor & self, const Tensor & other) { + return infer_type(self).__ixor__(self, other); +} +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__lshift___out(result, self, other); +} +static inline Tensor __lshift__(const Tensor & self, Scalar other) { + return infer_type(self).__lshift__(self, other); +} +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__lshift___out(result, self, other); +} +static inline Tensor __lshift__(const Tensor & self, const Tensor & other) { + return infer_type(self).__lshift__(self, other); +} +static inline Tensor & __ilshift__(Tensor & self, Scalar other) { + return infer_type(self).__ilshift__(self, other); +} +static inline Tensor & __ilshift__(Tensor & self, const Tensor & other) { + return 
infer_type(self).__ilshift__(self, other); +} +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__rshift___out(result, self, other); +} +static inline Tensor __rshift__(const Tensor & self, Scalar other) { + return infer_type(self).__rshift__(self, other); +} +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__rshift___out(result, self, other); +} +static inline Tensor __rshift__(const Tensor & self, const Tensor & other) { + return infer_type(self).__rshift__(self, other); +} +static inline Tensor & __irshift__(Tensor & self, Scalar other) { + return infer_type(self).__irshift__(self, other); +} +static inline Tensor & __irshift__(Tensor & self, const Tensor & other) { + return infer_type(self).__irshift__(self, other); +} +static inline Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).lt_out(result, self, other); +} +static inline Tensor lt(const Tensor & self, Scalar other) { + return infer_type(self).lt(self, other); +} +static inline Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).lt_out(result, self, other); +} +static inline Tensor lt(const Tensor & self, const Tensor & other) { + return infer_type(self).lt(self, other); +} +static inline Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).gt_out(result, self, other); +} +static inline Tensor gt(const Tensor & self, Scalar other) { + return infer_type(self).gt(self, other); +} +static inline Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).gt_out(result, self, other); +} +static inline Tensor gt(const Tensor & self, const Tensor & other) { + return infer_type(self).gt(self, other); +} +static inline Tensor & le_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).le_out(result, self, other); +} +static inline Tensor le(const Tensor & self, Scalar other) { + return infer_type(self).le(self, other); +} +static inline Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).le_out(result, self, other); +} +static inline Tensor le(const Tensor & self, const Tensor & other) { + return infer_type(self).le(self, other); +} +static inline Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).ge_out(result, self, other); +} +static inline Tensor ge(const Tensor & self, Scalar other) { + return infer_type(self).ge(self, other); +} +static inline Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).ge_out(result, self, other); +} +static inline Tensor ge(const Tensor & self, const Tensor & other) { + return infer_type(self).ge(self, other); +} +static inline Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).eq_out(result, self, other); +} +static inline Tensor eq(const Tensor & self, Scalar other) { + return infer_type(self).eq(self, other); +} +static inline Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).eq_out(result, self, other); +} +static inline Tensor eq(const Tensor & self, const Tensor & other) { + return infer_type(self).eq(self, other); +} +static inline Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other) { 
+ return infer_type(self).ne_out(result, self, other); +} +static inline Tensor ne(const Tensor & self, Scalar other) { + return infer_type(self).ne(self, other); +} +static inline Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).ne_out(result, self, other); +} +static inline Tensor ne(const Tensor & self, const Tensor & other) { + return infer_type(self).ne(self, other); +} +static inline std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).min_out(min, min_indices, self, dim, keepdim); +} +static inline std::tuple min(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).min(self, dim, keepdim); +} +static inline Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).min_out(result, self, other); +} +static inline Tensor min(const Tensor & self, const Tensor & other) { + return infer_type(self).min(self, other); +} +static inline Tensor min(const Tensor & self) { + return infer_type(self).min(self); +} +static inline std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).max_out(max, max_indices, self, dim, keepdim); +} +static inline std::tuple max(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).max(self, dim, keepdim); +} +static inline Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).max_out(result, self, other); +} +static inline Tensor max(const Tensor & self, const Tensor & other) { + return infer_type(self).max(self, other); +} +static inline Tensor max(const Tensor & self) { + return infer_type(self).max(self); +} +static inline std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool keepdim) { + return infer_type(self).kthvalue_out(values, indices, self, k, dim, keepdim); +} +static inline std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) { + return infer_type(self).kthvalue(self, k, dim, keepdim); +} +static inline std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mode_out(values, indices, self, dim, keepdim); +} +static inline std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mode(self, dim, keepdim); +} +static inline std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).median_out(values, indices, self, dim, keepdim); +} +static inline std::tuple median(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).median(self, dim, keepdim); +} +static inline Tensor median(const Tensor & self) { + return infer_type(self).median(self); +} +static inline std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + return infer_type(self).sort_out(values, indices, self, dim, descending); +} +static inline std::tuple sort(const Tensor & self, int64_t dim, bool descending) { + return infer_type(self).sort(self, dim, descending); +} +static inline std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + return infer_type(self).topk_out(values, indices, self, k, dim, largest, sorted); +} +static inline 
std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + return infer_type(self).topk(self, k, dim, largest, sorted); +} +static inline Tensor & abs_out(Tensor & result, const Tensor & self) { + return infer_type(self).abs_out(result, self); +} +static inline Tensor abs(const Tensor & self) { + return infer_type(self).abs(self); +} +static inline Tensor & sigmoid_out(Tensor & result, const Tensor & self) { + return infer_type(self).sigmoid_out(result, self); +} +static inline Tensor sigmoid(const Tensor & self) { + return infer_type(self).sigmoid(self); +} +static inline Tensor & log_out(Tensor & result, const Tensor & self) { + return infer_type(self).log_out(result, self); +} +static inline Tensor log(const Tensor & self) { + return infer_type(self).log(self); +} +static inline Tensor & log1p_out(Tensor & result, const Tensor & self) { + return infer_type(self).log1p_out(result, self); +} +static inline Tensor log1p(const Tensor & self) { + return infer_type(self).log1p(self); +} +static inline Tensor & lgamma_out(Tensor & result, const Tensor & self) { + return infer_type(self).lgamma_out(result, self); +} +static inline Tensor lgamma(const Tensor & self) { + return infer_type(self).lgamma(self); +} +static inline Tensor & digamma_out(Tensor & result, const Tensor & self) { + return infer_type(self).digamma_out(result, self); +} +static inline Tensor digamma(const Tensor & self) { + return infer_type(self).digamma(self); +} +static inline Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & self) { + return infer_type(self).polygamma_out(result, n, self); +} +static inline Tensor polygamma(int64_t n, const Tensor & self) { + return infer_type(self).polygamma(n, self); +} +static inline Tensor & exp_out(Tensor & result, const Tensor & self) { + return infer_type(self).exp_out(result, self); +} +static inline Tensor exp(const Tensor & self) { + return infer_type(self).exp(self); +} +static inline Tensor & expm1_out(Tensor & result, const Tensor & self) { + return infer_type(self).expm1_out(result, self); +} +static inline Tensor expm1(const Tensor & self) { + return infer_type(self).expm1(self); +} +static inline Tensor & cos_out(Tensor & result, const Tensor & self) { + return infer_type(self).cos_out(result, self); +} +static inline Tensor cos(const Tensor & self) { + return infer_type(self).cos(self); +} +static inline Tensor & acos_out(Tensor & result, const Tensor & self) { + return infer_type(self).acos_out(result, self); +} +static inline Tensor acos(const Tensor & self) { + return infer_type(self).acos(self); +} +static inline Tensor & cosh_out(Tensor & result, const Tensor & self) { + return infer_type(self).cosh_out(result, self); +} +static inline Tensor cosh(const Tensor & self) { + return infer_type(self).cosh(self); +} +static inline Tensor & sin_out(Tensor & result, const Tensor & self) { + return infer_type(self).sin_out(result, self); +} +static inline Tensor sin(const Tensor & self) { + return infer_type(self).sin(self); +} +static inline Tensor & asin_out(Tensor & result, const Tensor & self) { + return infer_type(self).asin_out(result, self); +} +static inline Tensor asin(const Tensor & self) { + return infer_type(self).asin(self); +} +static inline Tensor & sinh_out(Tensor & result, const Tensor & self) { + return infer_type(self).sinh_out(result, self); +} +static inline Tensor sinh(const Tensor & self) { + return infer_type(self).sinh(self); +} +static inline Tensor & tan_out(Tensor & result, const Tensor & self) { + 
return infer_type(self).tan_out(result, self); +} +static inline Tensor tan(const Tensor & self) { + return infer_type(self).tan(self); +} +static inline Tensor & atan_out(Tensor & result, const Tensor & self) { + return infer_type(self).atan_out(result, self); +} +static inline Tensor atan(const Tensor & self) { + return infer_type(self).atan(self); +} +static inline Tensor & tanh_out(Tensor & result, const Tensor & self) { + return infer_type(self).tanh_out(result, self); +} +static inline Tensor tanh(const Tensor & self) { + return infer_type(self).tanh(self); +} +static inline Tensor & erf_out(Tensor & result, const Tensor & self) { + return infer_type(self).erf_out(result, self); +} +static inline Tensor erf(const Tensor & self) { + return infer_type(self).erf(self); +} +static inline Tensor & erfc_out(Tensor & result, const Tensor & self) { + return infer_type(self).erfc_out(result, self); +} +static inline Tensor erfc(const Tensor & self) { + return infer_type(self).erfc(self); +} +static inline Tensor & erfinv_out(Tensor & result, const Tensor & self) { + return infer_type(self).erfinv_out(result, self); +} +static inline Tensor erfinv(const Tensor & self) { + return infer_type(self).erfinv(self); +} +static inline Tensor & sqrt_out(Tensor & result, const Tensor & self) { + return infer_type(self).sqrt_out(result, self); +} +static inline Tensor sqrt(const Tensor & self) { + return infer_type(self).sqrt(self); +} +static inline Tensor & rsqrt_out(Tensor & result, const Tensor & self) { + return infer_type(self).rsqrt_out(result, self); +} +static inline Tensor rsqrt(const Tensor & self) { + return infer_type(self).rsqrt(self); +} +static inline Tensor & ceil_out(Tensor & result, const Tensor & self) { + return infer_type(self).ceil_out(result, self); +} +static inline Tensor ceil(const Tensor & self) { + return infer_type(self).ceil(self); +} +static inline Tensor & floor_out(Tensor & result, const Tensor & self) { + return infer_type(self).floor_out(result, self); +} +static inline Tensor floor(const Tensor & self) { + return infer_type(self).floor(self); +} +static inline Tensor & round_out(Tensor & result, const Tensor & self) { + return infer_type(self).round_out(result, self); +} +static inline Tensor round(const Tensor & self) { + return infer_type(self).round(self); +} +static inline Tensor & trunc_out(Tensor & result, const Tensor & self) { + return infer_type(self).trunc_out(result, self); +} +static inline Tensor trunc(const Tensor & self) { + return infer_type(self).trunc(self); +} +static inline Tensor & frac_out(Tensor & result, const Tensor & self) { + return infer_type(self).frac_out(result, self); +} +static inline Tensor frac(const Tensor & self) { + return infer_type(self).frac(self); +} +static inline Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mean_out(result, self, dim, keepdim); +} +static inline Tensor mean(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mean(self, dim, keepdim); +} +static inline Tensor mean(const Tensor & self) { + return infer_type(self).mean(self); +} +static inline Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).var_out(result, self, dim, unbiased, keepdim); +} +static inline Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).var(self, dim, unbiased, keepdim); +} +static inline Tensor var(const Tensor & self, bool 
unbiased) { + return infer_type(self).var(self, unbiased); +} +static inline Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).std_out(result, self, dim, unbiased, keepdim); +} +static inline Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).std(self, dim, unbiased, keepdim); +} +static inline Tensor std(const Tensor & self, bool unbiased) { + return infer_type(self).std(self, unbiased); +} +static inline Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim) { + return infer_type(self).norm_out(result, self, p, dim, keepdim); +} +static inline Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) { + return infer_type(self).norm(self, p, dim, keepdim); +} +static inline Tensor norm(const Tensor & self, Scalar p) { + return infer_type(self).norm(self, p); +} +static inline Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + return infer_type(self).renorm_out(result, self, p, dim, maxnorm); +} +static inline Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + return infer_type(self).renorm(self, p, dim, maxnorm); +} +static inline Tensor dist(const Tensor & self, const Tensor & other, Scalar p) { + return infer_type(self).dist(self, other, p); +} +static inline Tensor & reciprocal_out(Tensor & result, const Tensor & self) { + return infer_type(self).reciprocal_out(result, self); +} +static inline Tensor reciprocal(const Tensor & self) { + return infer_type(self).reciprocal(self); +} +static inline Tensor & neg_out(Tensor & result, const Tensor & self) { + return infer_type(self).neg_out(result, self); +} +static inline Tensor neg(const Tensor & self) { + return infer_type(self).neg(self); +} +static inline Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).atan2_out(result, self, other); +} +static inline Tensor atan2(const Tensor & self, const Tensor & other) { + return infer_type(self).atan2(self, other); +} +static inline Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent) { + return infer_type(self).pow_out(result, self, exponent); +} +static inline Tensor pow(const Tensor & self, Scalar exponent) { + return infer_type(self).pow(self, exponent); +} +static inline Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) { + return infer_type(self).pow_out(result, self, exponent); +} +static inline Tensor pow(const Tensor & self, const Tensor & exponent) { + return infer_type(self).pow(self, exponent); +} +static inline Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self) { + return infer_type(self).pow_out(result, base, self); +} +static inline Tensor pow(Scalar base, const Tensor & self) { + return infer_type(self).pow(base, self); +} +static inline Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) { + return infer_type(self).lerp_out(result, self, end, weight); +} +static inline Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) { + return infer_type(self).lerp(self, end, weight); +} +static inline Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps) { + return infer_type(result).linspace_out(result, start, end, steps); +} +static inline Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps) { + return 
infer_type(result).logspace_out(result, start, end, steps); +} +static inline Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { + return infer_type(self).histc_out(result, self, bins, min, max); +} +static inline Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { + return infer_type(self).histc(self, bins, min, max); +} +static inline Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).sum_out(result, self, dim, keepdim); +} +static inline Tensor sum(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).sum(self, dim, keepdim); +} +static inline Tensor sum(const Tensor & self) { + return infer_type(self).sum(self); +} +static inline Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).prod_out(result, self, dim, keepdim); +} +static inline Tensor prod(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).prod(self, dim, keepdim); +} +static inline Tensor prod(const Tensor & self) { + return infer_type(self).prod(self); +} +static inline Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim) { + return infer_type(self).cumsum_out(result, self, dim); +} +static inline Tensor cumsum(const Tensor & self, int64_t dim) { + return infer_type(self).cumsum(self, dim); +} +static inline Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim) { + return infer_type(self).cumprod_out(result, self, dim); +} +static inline Tensor cumprod(const Tensor & self, int64_t dim) { + return infer_type(self).cumprod(self, dim); +} +static inline Tensor & sign_out(Tensor & result, const Tensor & self) { + return infer_type(self).sign_out(result, self); +} +static inline Tensor sign(const Tensor & self) { + return infer_type(self).sign(self); +} +static inline Tensor trace(const Tensor & self) { + return infer_type(self).trace(self); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, SparseTensor other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).sub_out(result, self, other, alpha); +} +static inline Tensor sub(const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).sub(self, other, alpha); +} +static inline Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).sub_out(result, self, other, alpha); +} +static inline Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).sub(self, other, alpha); +} +static inline Tensor & 
mul_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).mul_out(result, self, other); +} +static inline Tensor mul(const Tensor & self, Scalar other) { + return infer_type(self).mul(self, other); +} +static inline Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).mul_out(result, self, other); +} +static inline Tensor mul(const Tensor & self, const Tensor & other) { + return infer_type(self).mul(self, other); +} +static inline Tensor & div_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).div_out(result, self, other); +} +static inline Tensor div(const Tensor & self, Scalar other) { + return infer_type(self).div(self, other); +} +static inline Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).div_out(result, self, other); +} +static inline Tensor div(const Tensor & self, const Tensor & other) { + return infer_type(self).div(self, other); +} +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).fmod_out(result, self, other); +} +static inline Tensor fmod(const Tensor & self, Scalar other) { + return infer_type(self).fmod(self, other); +} +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).fmod_out(result, self, other); +} +static inline Tensor fmod(const Tensor & self, const Tensor & other) { + return infer_type(self).fmod(self, other); +} +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).remainder_out(result, self, other); +} +static inline Tensor remainder(const Tensor & self, Scalar other) { + return infer_type(self).remainder(self, other); +} +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).remainder_out(result, self, other); +} +static inline Tensor remainder(const Tensor & self, const Tensor & other) { + return infer_type(self).remainder(self, other); +} +static inline Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp_out(result, self, min, max); +} +static inline Tensor clamp(const Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp(self, min, max); +} +static inline Tensor & clamp_(Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp_(self, min, max); +} +static inline Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min) { + return infer_type(self).clamp_min_out(result, self, min); +} +static inline Tensor clamp_min(const Tensor & self, Scalar min) { + return infer_type(self).clamp_min(self, min); +} +static inline Tensor & clamp_min_(Tensor & self, Scalar min) { + return infer_type(self).clamp_min_(self, min); +} +static inline Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max) { + return infer_type(self).clamp_max_out(result, self, max); +} +static inline Tensor clamp_max(const Tensor & self, Scalar max) { + return infer_type(self).clamp_max(self, max); +} +static inline Tensor & clamp_max_(Tensor & self, Scalar max) { + return infer_type(self).clamp_max_(self, max); +} +static inline Tensor _dot(const Tensor & self, const Tensor & tensor) { + return infer_type(self)._dot(self, tensor); +} +static inline Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return 
infer_type(self).tril_out(result, self, diagonal); +} +static inline Tensor tril(const Tensor & self, int64_t diagonal) { + return infer_type(self).tril(self, diagonal); +} +static inline Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return infer_type(self).triu_out(result, self, diagonal); +} +static inline Tensor triu(const Tensor & self, int64_t diagonal) { + return infer_type(self).triu(self, diagonal); +} +static inline Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { + return infer_type(self).cross_out(result, self, other, dim); +} +static inline Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) { + return infer_type(self).cross(self, other, dim); +} +static inline Tensor & eye_out(Tensor & result, int64_t n, int64_t m) { + return infer_type(result).eye_out(result, n, m); +} +static inline Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return infer_type(self).diag_out(result, self, diagonal); +} +static inline Tensor diag(const Tensor & self, int64_t diagonal) { + return infer_type(self).diag(self, diagonal); +} +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self)._addmv_out(result, self, mat, vec, beta, alpha); +} +static inline Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self)._addmv(self, mat, vec, beta, alpha); +} +static inline Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self)._addr_out(result, self, vec1, vec2, beta, alpha); +} +static inline Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self)._addr(self, vec1, vec2, beta, alpha); +} +static inline Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) { + return infer_type(self)._ger_out(result, self, vec2); +} +static inline Tensor _ger(const Tensor & self, const Tensor & vec2) { + return infer_type(self)._ger(self, vec2); +} +static inline Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec) { + return infer_type(self)._mv_out(result, self, vec); +} +static inline Tensor _mv(const Tensor & self, const Tensor & vec) { + return infer_type(self)._mv(self, vec); +} +static inline Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self)._mm_out(result, self, mat2); +} +static inline Tensor _mm(const Tensor & self, const Tensor & mat2) { + return 
infer_type(self)._mm(self, mat2); +} +static inline Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self).bmm_out(result, self, mat2); +} +static inline Tensor bmm(const Tensor & self, const Tensor & mat2) { + return infer_type(self).bmm(self, mat2); +} +static inline Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).addbmm_out(result, self, batch1, batch2, beta, alpha); +} +static inline Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).addbmm(self, batch1, batch2, beta, alpha); +} +static inline Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).baddbmm_out(result, self, batch1, batch2, beta, alpha); +} +static inline Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).baddbmm(self, batch1, batch2, beta, alpha); +} +static inline Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcmul_out(result, self, tensor1, tensor2, value); +} +static inline Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcmul(self, tensor1, tensor2, value); +} +static inline Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcdiv_out(result, self, tensor1, tensor2, value); +} +static inline Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcdiv(self, tensor1, tensor2, value); +} +static inline std::tuple gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A) { + return infer_type(self).gesv_out(solution, lu, self, A); +} +static inline std::tuple gesv(const Tensor & self, const Tensor & A) { + return infer_type(self).gesv(self, A); +} +static inline std::tuple gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { + return infer_type(self).gels_out(res1, res2, self, A); +} +static inline std::tuple gels(const Tensor & self, const Tensor & A) { + return infer_type(self).gels(self, A); +} +static inline std::tuple trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) { + return infer_type(self).trtrs_out(res1, res2, self, A, upper, transpose, unitriangular); +} +static inline std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) { + return infer_type(self).trtrs(self, A, upper, transpose, unitriangular); +} +static inline std::tuple symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors, bool upper) { + return infer_type(self).symeig_out(res1, res2, self, eigenvectors, upper); +} +static inline std::tuple symeig(const Tensor & self, bool eigenvectors, bool upper) { + return infer_type(self).symeig(self, eigenvectors, upper); +} +static inline std::tuple eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { + return infer_type(self).eig_out(res1, res2, self, eigenvectors); +} +static inline std::tuple eig(const 
Tensor & self, bool eigenvectors) { + return infer_type(self).eig(self, eigenvectors); +} +static inline std::tuple svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some) { + return infer_type(self).svd_out(res1, res2, res3, self, some); +} +static inline std::tuple svd(const Tensor & self, bool some) { + return infer_type(self).svd(self, some); +} +static inline Tensor & inverse_out(Tensor & output, const Tensor & self) { + return infer_type(self).inverse_out(output, self); +} +static inline Tensor inverse(const Tensor & self) { + return infer_type(self).inverse(self); +} +static inline Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper) { + return infer_type(self).potrf_out(output, self, upper); +} +static inline Tensor potrf(const Tensor & self, bool upper) { + return infer_type(self).potrf(self, upper); +} +static inline Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper) { + return infer_type(self).potrs_out(result, self, input2, upper); +} +static inline Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) { + return infer_type(self).potrs(self, input2, upper); +} +static inline Tensor & potri_out(Tensor & output, const Tensor & self, bool upper) { + return infer_type(self).potri_out(output, self, upper); +} +static inline Tensor potri(const Tensor & self, bool upper) { + return infer_type(self).potri(self, upper); +} +static inline std::tuple pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper, Scalar tol) { + return infer_type(self).pstrf_out(res1, res2, self, upper, tol); +} +static inline std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) { + return infer_type(self).pstrf(self, upper, tol); +} +static inline std::tuple qr_out(Tensor & res1, Tensor & res2, const Tensor & self) { + return infer_type(self).qr_out(res1, res2, self); +} +static inline std::tuple qr(const Tensor & self) { + return infer_type(self).qr(self); +} +static inline std::tuple geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + return infer_type(self).geqrf_out(res1, res2, self); +} +static inline std::tuple geqrf(const Tensor & self) { + return infer_type(self).geqrf(self); +} +static inline Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { + return infer_type(self).orgqr_out(result, self, input2); +} +static inline Tensor orgqr(const Tensor & self, const Tensor & input2) { + return infer_type(self).orgqr(self, input2); +} +static inline Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + return infer_type(self).ormqr_out(result, self, input2, input3, left, transpose); +} +static inline Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + return infer_type(self).ormqr(self, input2, input3, left, transpose); +} +static inline std::tuple btrifact_out(Tensor & result, Tensor & pivots, const Tensor & self, bool pivot) { + return infer_type(self).btrifact_out(result, pivots, self, pivot); +} +static inline std::tuple btrifact(const Tensor & self, bool pivot) { + return infer_type(self).btrifact(self, pivot); +} +static inline std::tuple btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot) { + return infer_type(self).btrifact_with_info_out(result, pivots, info, self, pivot); +} +static inline std::tuple btrifact_with_info(const Tensor & self, bool 
pivot) { + return infer_type(self).btrifact_with_info(self, pivot); +} +static inline Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) { + return infer_type(self).btrisolve_out(result, self, LU_data, LU_pivots); +} +static inline Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) { + return infer_type(self).btrisolve(self, LU_data, LU_pivots); +} +static inline Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator) { + return infer_type(result).randperm_out(result, n, generator); +} +static inline Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) { + return infer_type(self).multinomial_out(result, self, num_samples, replacement, generator); +} +static inline Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) { + return infer_type(self).multinomial(self, num_samples, replacement, generator); +} +static inline Tensor & normal_out(Tensor & output, const Tensor & mean, double std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(const Tensor & mean, double std, Generator * generator) { + return infer_type(mean).normal(mean, std, generator); +} +static inline Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(double mean, const Tensor & std, Generator * generator) { + return infer_type(std).normal(mean, std, generator); +} +static inline Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator) { + return infer_type(mean).normal(mean, std, generator); +} +static inline Tensor & rand_out(Tensor & result, IntList size, Generator * generator) { + return infer_type(result).rand_out(result, size, generator); +} +static inline Tensor & randn_out(Tensor & result, IntList size, Generator * generator) { + return infer_type(result).randn_out(result, size, generator); +} +static inline Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator) { + return infer_type(self).bernoulli_out(output, self, generator); +} +static inline Tensor bernoulli(const Tensor & self, Generator * generator) { + return infer_type(self).bernoulli(self, generator); +} +static inline Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator) { + return infer_type(self)._standard_gamma_out(output, self, generator); +} +static inline Tensor _standard_gamma(const Tensor & self, Generator * generator) { + return infer_type(self)._standard_gamma(self, generator); +} +static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total) { + return infer_type(output)._dirichlet_grad_out(output, x, alpha, total); +} +static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total) { + return infer_type(x)._dirichlet_grad(x, alpha, total); +} +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) { + return infer_type(values).sparse_coo_tensor(indices, values, size); +} +static inline 
Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) { + return infer_type(values).sparse_coo_tensor(indices, values); +} +static inline Tensor alias(const Tensor & self) { + return infer_type(self).alias(self); +} +static inline Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided_out(result, self, size, stride, storage_offset); +} +static inline Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided(self, size, stride, storage_offset); +} +static inline Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided_(self, size, stride, storage_offset); +} +static inline Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim) { + return infer_type(self)._cat_out(self, tensors, dim); +} +static inline Tensor _cat(TensorList tensors, int64_t dim) { + return infer_type(tensors)._cat(tensors, dim); +} +static inline Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_out(output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy(self, target, weight, size_average, reduce); +} +static inline Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_forward_out(output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_forward(self, target, weight, size_average, reduce); +} +static inline Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_backward_out(grad_input, grad_output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_backward(grad_output, self, target, weight, size_average, reduce); +} +static inline Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_out(output, self, target, size_average, reduce); +} +static inline Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div(self, target, size_average, reduce); +} +static inline Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return 
infer_type(self).kl_div_forward(self, target, size_average, reduce); +} +static inline Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss(self, target, size_average, reduce); +} +static inline Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss(self, target, size_average, reduce); +} +static inline Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_out(output, self, 
target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss(self, target, p, margin, weight, size_average); +} +static inline Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_forward_out(output, self, target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_forward(self, target, p, margin, weight, size_average); +} +static inline Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_backward_out(grad_input, self, target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_backward(self, target, p, margin, weight, size_average); +} +static inline Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss(self, target, size_average, reduce); +} +static inline std::tuple<Tensor &,Tensor &> multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_forward_out(output, is_target, self, target, size_average, reduce); +} +static inline std::tuple<Tensor,Tensor> multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) { + return infer_type(self).multilabel_margin_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce, is_target); +} +static inline Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) { + return infer_type(self).multilabel_margin_loss_backward(grad_output, self, target, size_average, reduce, is_target); +} +static inline Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_out(output, self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor nll_loss(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return 
infer_type(self).nll_loss(self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor &,Tensor &> nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_forward_out(output, total_weight, self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor,Tensor> nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_forward(self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss_backward_out(grad_input, grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss_backward(grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_out(output, self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d(self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor &,Tensor &> nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_forward_out(output, total_weight, self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor,Tensor> nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_forward(self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss2d_backward_out(grad_input, grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss2d_backward(grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return 
infer_type(self).smooth_l1_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss(self, target, size_average, reduce); +} +static inline Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_out(output, self, target, size_average); +} +static inline Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss(self, target, size_average); +} +static inline Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_forward_out(output, self, target, size_average); +} +static inline Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_forward(self, target, size_average); +} +static inline Tensor & soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_backward_out(grad_input, self, target, size_average); +} +static inline Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_backward(self, target, size_average); +} +static inline Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_out(output, self, alpha, scale); +} +static inline Tensor elu(const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu(self, alpha, scale); +} +static inline Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward_out(output, self, alpha, scale); +} +static inline Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward(self, alpha, scale); +} +static inline Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) { + return infer_type(grad_input).elu_backward_out(grad_input, grad_output, alpha, scale, output); +} +static inline Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) { + return 
infer_type(grad_output).elu_backward(grad_output, alpha, scale, output); +} +static inline Tensor & elu_(Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_(self, alpha, scale); +} +static inline Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward_(self, alpha, scale); +} +static inline Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_out(output, self, dim); +} +static inline Tensor glu(const Tensor & self, int64_t dim) { + return infer_type(self).glu(self, dim); +} +static inline Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_forward_out(output, self, dim); +} +static inline Tensor glu_forward(const Tensor & self, int64_t dim) { + return infer_type(self).glu_forward(self, dim); +} +static inline Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_backward_out(grad_input, grad_output, self, dim); +} +static inline Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_backward(grad_output, self, dim); +} +static inline Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_out(output, self, min_val, max_val); +} +static inline Tensor hardtanh(const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh(self, min_val, max_val); +} +static inline Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward_out(output, self, min_val, max_val); +} +static inline Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward(self, min_val, max_val); +} +static inline Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_backward_out(grad_input, grad_output, self, min_val, max_val); +} +static inline Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_backward(grad_output, self, min_val, max_val); +} +static inline Tensor & hardtanh_(Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_(self, min_val, max_val); +} +static inline Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward_(self, min_val, max_val); +} +static inline Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_out(output, self, negative_slope); +} +static inline Tensor leaky_relu(const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu(self, negative_slope); +} +static inline Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward_out(output, self, negative_slope); +} +static inline Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward(self, negative_slope); +} +static inline Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope) { + return 
infer_type(self).leaky_relu_backward_out(grad_input, grad_output, self, negative_slope); +} +static inline Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_backward(grad_output, self, negative_slope); +} +static inline Tensor & leaky_relu_(Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_(self, negative_slope); +} +static inline Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward_(self, negative_slope); +} +static inline Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) { + return infer_type(self).log_sigmoid_out(output, self); +} +static inline Tensor log_sigmoid(const Tensor & self) { + return infer_type(self).log_sigmoid(self); +} +static inline std::tuple<Tensor &,Tensor &> log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) { + return infer_type(self).log_sigmoid_forward_out(output, buffer, self); +} +static inline std::tuple<Tensor,Tensor> log_sigmoid_forward(const Tensor & self) { + return infer_type(self).log_sigmoid_forward(self); +} +static inline Tensor & log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + return infer_type(self).log_sigmoid_backward_out(grad_input, grad_output, self, buffer); +} +static inline Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + return infer_type(self).log_sigmoid_backward(grad_output, self, buffer); +} +static inline Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_out(output, self, dim); +} +static inline Tensor log_softmax(const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax(self, dim); +} +static inline Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_forward_out(output, self, dim); +} +static inline Tensor log_softmax_forward(const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_forward(self, dim); +} +static inline Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).log_softmax_backward_out(grad_input, grad_output, self, dim, output); +} +static inline Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).log_softmax_backward(grad_output, self, dim, output); +} +static inline Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_out(output, self, weight); +} +static inline Tensor prelu(const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu(self, weight); +} +static inline Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_forward_out(output, self, weight); +} +static inline Tensor prelu_forward(const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_forward(self, weight); +} +static inline std::tuple<Tensor &,Tensor &> prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_backward_out(grad_input, grad_weight, grad_output, self, weight); +} +static inline std::tuple<Tensor,Tensor> prelu_backward(const Tensor & 
grad_output, const Tensor & self, const Tensor & weight, std::array<bool,2> output_mask) { + return infer_type(self).prelu_backward(grad_output, self, weight, output_mask); +} +static inline Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_out(output, self, noise, lower, upper, training, generator); +} +static inline Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward_out(output, self, noise, lower, upper, training, generator); +} +static inline Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + return infer_type(self).rrelu_with_noise_backward_out(grad_input, grad_output, self, noise, lower, upper, training); +} +static inline Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + return infer_type(self).rrelu_with_noise_backward(grad_output, self, noise, lower, upper, training); +} +static inline Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward_(self, noise, lower, upper, training, generator); +} +static inline Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).softmax_out(output, self, dim); +} +static inline Tensor softmax(const Tensor & self, int64_t dim) { + return infer_type(self).softmax(self, dim); +} +static inline Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).softmax_forward_out(output, self, dim); +} +static inline Tensor softmax_forward(const Tensor & self, int64_t dim) { + return infer_type(self).softmax_forward(self, dim); +} +static inline Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).softmax_backward_out(grad_input, grad_output, self, dim, output); +} +static inline Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).softmax_backward(grad_output, self, dim, output); +} +static inline Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_out(output, self, beta, threshold); +} +static inline Tensor 
softplus(const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus(self, beta, threshold); +} +static inline Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_forward_out(output, self, beta, threshold); +} +static inline Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_forward(self, beta, threshold); +} +static inline Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) { + return infer_type(self).softplus_backward_out(grad_input, grad_output, self, beta, threshold, output); +} +static inline Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) { + return infer_type(self).softplus_backward(grad_output, self, beta, threshold, output); +} +static inline Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_out(output, self, lambd); +} +static inline Tensor softshrink(const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink(self, lambd); +} +static inline Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_forward_out(output, self, lambd); +} +static inline Tensor softshrink_forward(const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_forward(self, lambd); +} +static inline Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_backward_out(grad_input, grad_output, self, lambd); +} +static inline Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_backward(grad_output, self, lambd); +} +static inline Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_out(output, self, threshold, value); +} +static inline Tensor threshold(const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold(self, threshold, value); +} +static inline Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward_out(output, self, threshold, value); +} +static inline Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward(self, threshold, value); +} +static inline Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_backward_out(grad_input, grad_output, self, threshold, value); +} +static inline Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_backward(grad_output, self, threshold, value); +} +static inline Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_(self, threshold, value); +} +static inline Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward_(self, threshold, value); +} +static inline Tensor & adaptive_avg_pool2d_out(Tensor & 
output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d(self, output_size); +} +static inline Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_forward_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_forward(self, output_size); +} +static inline Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool2d_backward_out(grad_input, grad_output, self); +} +static inline Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool2d_backward(grad_output, self); +} +static inline Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d(self, output_size); +} +static inline Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_forward_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_forward(self, output_size); +} +static inline Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool3d_backward_out(grad_input, grad_output, self); +} +static inline Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool3d_backward(grad_output, self); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool2d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d(self, output_size); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_forward_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_forward(self, output_size); +} +static inline Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool2d_backward_out(grad_input, grad_output, self, indices); +} +static inline Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool2d_backward(grad_output, self, indices); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & 
self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool3d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d(self, output_size); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_forward_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_forward(self, output_size); +} +static inline Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool3d_backward_out(grad_input, grad_output, self, indices); +} +static inline Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool3d_backward(grad_output, self, indices); +} +static inline Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_forward(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & 
avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_forward(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline std::tuple<Tensor &,Tensor &> fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_out(output, indices, self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor,Tensor> fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d(self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor &,Tensor &> fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_forward_out(output, indices, self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor,Tensor> fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_forward(self, kernel_size, output_size, random_samples); +} +static inline Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) { + return infer_type(self).fractional_max_pool2d_backward_out(grad_input, grad_output, self, kernel_size, output_size, indices); +} +static inline Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) { + return infer_type(self).fractional_max_pool2d_backward(grad_output, self, kernel_size, output_size, indices); +} +static inline std::tuple<Tensor &,Tensor &> max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool2d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor &,Tensor &> 
max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_forward(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool2d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool2d_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline std::tuple<Tensor &,Tensor &> max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool3d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor &,Tensor &> max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_forward(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool3d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool3d_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_out(output, self, indices, output_size); +} +static inline Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d(self, indices, output_size); +} +static inline Tensor & max_unpool2d_forward_out(Tensor 
& output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_forward_out(output, self, indices, output_size); +} +static inline Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_forward(self, indices, output_size); +} +static inline Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_backward_out(grad_input, grad_output, self, indices, output_size); +} +static inline Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_backward(grad_output, self, indices, output_size); +} +static inline Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_out(output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d(self, indices, output_size, stride, padding); +} +static inline Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_forward_out(output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_forward(self, indices, output_size, stride, padding); +} +static inline Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_backward_out(grad_input, grad_output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_backward(grad_output, self, indices, output_size, stride, padding); +} +static inline Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_out(output, self, padding); +} +static inline Tensor reflection_pad1d(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d(self, padding); +} +static inline Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_forward_out(output, self, padding); +} +static inline Tensor reflection_pad1d_forward(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_forward(self, padding); +} +static inline Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return 
infer_type(self).reflection_pad1d_backward(grad_output, self, padding); +} +static inline Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_out(output, self, padding); +} +static inline Tensor reflection_pad2d(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d(self, padding); +} +static inline Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_forward_out(output, self, padding); +} +static inline Tensor reflection_pad2d_forward(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_forward(self, padding); +} +static inline Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_out(output, self, padding); +} +static inline Tensor replication_pad1d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d(self, padding); +} +static inline Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_forward_out(output, self, padding); +} +static inline Tensor replication_pad1d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_forward(self, padding); +} +static inline Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_out(output, self, padding); +} +static inline Tensor replication_pad2d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d(self, padding); +} +static inline Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_forward_out(output, self, padding); +} +static inline Tensor replication_pad2d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_forward(self, padding); +} +static inline Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding) { + return 
infer_type(self).replication_pad3d_out(output, self, padding); +} +static inline Tensor replication_pad3d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d(self, padding); +} +static inline Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_forward_out(output, self, padding); +} +static inline Tensor replication_pad3d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_forward(self, padding); +} +static inline Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_backward(grad_output, self, padding); +} +static inline Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_out(output, self, output_size); +} +static inline Tensor upsample_linear1d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d(self, output_size); +} +static inline Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_forward_out(output, self, output_size); +} +static inline Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_forward(self, output_size); +} +static inline Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_linear1d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_linear1d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_out(output, self, output_size); +} +static inline Tensor upsample_bilinear2d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d(self, output_size); +} +static inline Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_forward_out(output, self, output_size); +} +static inline Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_forward(self, output_size); +} +static inline Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_bilinear2d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_bilinear2d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size) { + return 
infer_type(self).upsample_trilinear3d_out(output, self, output_size); +} +static inline Tensor upsample_trilinear3d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d(self, output_size); +} +static inline Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d_forward_out(output, self, output_size); +} +static inline Tensor upsample_trilinear3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d_forward(self, output_size); +} +static inline Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_trilinear3d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_trilinear3d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d(self, scale_factor); +} +static inline Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_backward(grad_output, self, scale_factor); +} +static inline Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d(self, scale_factor); +} +static inline Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_backward(grad_output, self, scale_factor); +} +static inline Tensor & upsample_nearest3d_out(Tensor & 
output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d(self, scale_factor); +} +static inline Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_backward(grad_output, self, scale_factor); +} +static inline Tensor & _sigmoid_out(Tensor & output, const Tensor & self) { + return infer_type(self)._sigmoid_out(output, self); +} +static inline Tensor _sigmoid(const Tensor & self) { + return infer_type(self)._sigmoid(self); +} +static inline Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self) { + return infer_type(self)._sigmoid_forward_out(output, self); +} +static inline Tensor _sigmoid_forward(const Tensor & self) { + return infer_type(self)._sigmoid_forward(self); +} +static inline Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_input)._sigmoid_backward_out(grad_input, grad_output, output); +} +static inline Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_output)._sigmoid_backward(grad_output, output); +} +static inline Tensor & _tanh_out(Tensor & output, const Tensor & self) { + return infer_type(self)._tanh_out(output, self); +} +static inline Tensor _tanh(const Tensor & self) { + return infer_type(self)._tanh(self); +} +static inline Tensor & _tanh_forward_out(Tensor & output, const Tensor & self) { + return infer_type(self)._tanh_forward_out(output, self); +} +static inline Tensor _tanh_forward(const Tensor & self) { + return infer_type(self)._tanh_forward(self); +} +static inline Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_input)._tanh_backward_out(grad_input, grad_output, output); +} +static inline Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_output)._tanh_backward(grad_output, output); +} +static inline Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_out(output, self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm(self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline 
std::tuple<Tensor &,Tensor &,Tensor &> thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_forward_out(output, save_mean, save_std, self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_forward(self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_batch_norm_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std) { + return infer_type(self).thnn_batch_norm_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, running_mean, running_var, training, eps, save_mean, save_std); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array<bool,3> output_mask) { + return infer_type(self).thnn_batch_norm_backward(grad_output, self, weight, running_mean, running_var, training, eps, save_mean, save_std, output_mask); +} +static inline Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_out(output, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_forward(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones) {
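+  // As in the rest of this file, the call is dispatched through the Type object
+  // returned by infer_type(self), which supplies the backend-specific implementation.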
+ return infer_type(self).thnn_conv_transpose2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_transpose2d_backward(grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, columns, ones, output_mask); +} +static inline Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_out(output, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_forward(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv_transpose3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, finput, fgrad_input); +} +static inline std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv_transpose3d_backward(grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_out(output, self, weight, kernel_size, bias, stride, padding); +} +static inline Tensor thnn_conv2d(const Tensor & self, const Tensor 
& weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_forward(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input); +} +static inline std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv2d_backward(grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_forward_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_backward_out(grad_input, grad_weight, grad_output, self, weight, kernel_size, stride, padding, dilation); +} +static inline std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, 
std::array output_mask) { + return infer_type(self).thnn_conv_depthwise2d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, output_mask); +} +static inline Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_out(output, self, weight, kernel_size, bias, stride, padding); +} +static inline Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_forward(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input); +} +static inline std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv3d_backward(grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple 
thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) { + return infer_type(self).thnn_conv_dilated2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_dilated2d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones, output_mask); +} +static inline Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) { + return infer_type(self).thnn_conv_dilated3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_dilated3d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones, output_mask); +} +static inline Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool1d(self, output_size); +} +static inline std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool1d(self, output_size); +} +static inline bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol) { + return 
infer_type(self).allclose(self, other, rtol, atol); +} +static inline Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv(self, mat, vec, beta, alpha); +} +static inline Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv_(self, mat, vec, beta, alpha); +} +static inline Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv_out(result, self, mat, vec, beta, alpha); +} +static inline Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr(self, vec1, vec2, beta, alpha); +} +static inline Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr_(self, vec1, vec2, beta, alpha); +} +static inline Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr_out(result, self, vec1, vec2, beta, alpha); +} +static inline Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled) { + return infer_type(input).batch_norm(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled); +} +static inline Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) { + return infer_type(self).bernoulli_(self, p, generator); +} +static inline Tensor & bernoulli_(Tensor & self, double p, Generator * generator) { + return infer_type(self).bernoulli_(self, p, generator); +} +static inline Tensor cat(TensorList tensors, int64_t dim) { + return infer_type(tensors).cat(tensors, dim); +} +static inline Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim) { + return infer_type(result).cat_out(result, tensors, dim); +} +static inline Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).sspaddmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).sspaddmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) { + return infer_type(self).chunk(self, chunks, dim); +} +static inline bool cudnn_is_acceptable(const Tensor & self) { + return infer_type(self).cudnn_is_acceptable(self); +} +static inline Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) { + return infer_type(input).convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups); +} +static inline Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return infer_type(input)._convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, 
deterministic, cudnn_enabled); +} +static inline Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) { + return infer_type(input)._convolution_nogroup(input, weight, bias, stride, padding, dilation, transposed, output_padding); +} +static inline std::tuple _convolution_double_backward(const Tensor & ggI, const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) { + return infer_type(self)._convolution_double_backward(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, output_mask); +} +static inline Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv1d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv2d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv3d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv_tbc(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad) { + return infer_type(self).conv_tbc(self, weight, bias, pad); +} +static inline std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) { + return infer_type(self).conv_tbc_backward(self, input, weight, bias, pad); +} +static inline Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose1d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose2d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose3d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) { + return infer_type(theta).cudnn_affine_grid_generator(theta, N, C, H, W); +} +static inline Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) { + return infer_type(grad).cudnn_affine_grid_generator_backward(grad, N, C, H, W); +} +static inline std::tuple cudnn_batch_norm(const Tensor & input, const 
Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon) { + return infer_type(input).cudnn_batch_norm(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon); +} +static inline std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon) { + return infer_type(input).cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon); +} +static inline Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution(self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(grad_output).cudnn_convolution_backward_input(self_size, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { + return infer_type(self).cudnn_convolution_backward(self, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, output_mask); +} +static inline Tensor cudnn_convolution_backward_bias(const Tensor & grad_output) { + return infer_type(grad_output).cudnn_convolution_backward_bias(grad_output); +} +static inline Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_backward_weight(weight_size, grad_output, self, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_transpose(self, weight, bias, padding, output_padding, stride, dilation, groups, benchmark, deterministic); +} +static inline std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { + return infer_type(self).cudnn_convolution_transpose_backward(self, grad_output, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, output_mask); +} +static inline Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output) { + return infer_type(grad_output).cudnn_convolution_transpose_backward_bias(grad_output); +} +static inline Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const 
Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(grad_output).cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_transpose_backward_weight(weight_size, grad_output, self, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid) { + return infer_type(self).cudnn_grid_sampler(self, grid); +} +static inline std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output) { + return infer_type(self).cudnn_grid_sampler_backward(self, grid, grad_output); +} +static inline Tensor det(const Tensor & self) { + return infer_type(self).det(self); +} +static inline std::tuple _det_with_svd(const Tensor & self) { + return infer_type(self)._det_with_svd(self); +} +static inline Tensor dot(const Tensor & self, const Tensor & tensor) { + return infer_type(self).dot(self, tensor); +} +static inline Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + return infer_type(weight).embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse); +} +static inline Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + return infer_type(grad).embedding_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse); +} +static inline Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { + return infer_type(grad).embedding_dense_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq); +} +static inline Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type) { + return infer_type(self).embedding_renorm_(self, indices, max_norm, norm_type); +} +static inline Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { + return infer_type(grad).embedding_sparse_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq); +} +static inline Tensor empty_like(const Tensor & self) { + return infer_type(self).empty_like(self); +} +static inline std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse) { + return infer_type(weight).embedding_bag(weight, indices, offsets, scale_grad_by_freq, mode, sparse); +} +static inline Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) { + return infer_type(grad).embedding_bag_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, sparse); +} +static inline Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & 
offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) { + return infer_type(grad).embedding_bag_sparse_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode); +} +static inline Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) { + return infer_type(grad).embedding_bag_dense_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode); +} +static inline Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce) { + return infer_type(self).hinge_embedding_loss(self, target, margin, size_average, reduce); +} +static inline Tensor ger(const Tensor & self, const Tensor & vec2) { + return infer_type(self).ger(self, vec2); +} +static inline Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) { + return infer_type(self).ger_out(result, self, vec2); +} +static inline Tensor index(const Tensor & self, TensorList indices) { + return infer_type(self).index(self, indices); +} +static inline Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) { + return infer_type(self).index_put_(self, indices, values); +} +static inline bool is_cuda(const Tensor & self) { + return infer_type(self).is_cuda(self); +} +static inline bool is_distributed(const Tensor & self) { + return infer_type(self).is_distributed(self); +} +static inline bool is_floating_point(const Tensor & self) { + return infer_type(self).is_floating_point(self); +} +static inline bool is_nonzero(const Tensor & self) { + return infer_type(self).is_nonzero(self); +} +static inline bool is_same_size(const Tensor & self, const Tensor & other) { + return infer_type(self).is_same_size(self, other); +} +static inline bool is_signed(const Tensor & self) { + return infer_type(self).is_signed(self); +} +static inline bool is_sparse(const Tensor & self) { + return infer_type(self).is_sparse(self); +} +static inline Tensor matmul(const Tensor & self, const Tensor & other) { + return infer_type(self).matmul(self, other); +} +static inline std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool1d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor mm(const Tensor & self, const Tensor & mat2) { + return infer_type(self).mm(self, mat2); +} +static inline Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self).mm_out(result, self, mat2); +} +static inline Tensor mv(const Tensor & self, const Tensor & vec) { + return infer_type(self).mv(self, vec); +} +static inline Tensor & mv_out(Tensor & result, const Tensor & self, const Tensor & vec) { + return infer_type(self).mv_out(result, self, vec); +} +static inline Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) { + return infer_type(self).narrow(self, dim, start, length); +} +static inline Tensor pin_memory(const Tensor & self) { + return infer_type(self).pin_memory(self); +} +static inline Tensor rand_like(const Tensor & self) { + return infer_type(self).rand_like(self); +} +static inline Tensor randn_like(const Tensor & self) { + return infer_type(self).randn_like(self); +} +static inline Tensor repeat(const Tensor & 
self, IntList repeats) { + return infer_type(self).repeat(self, repeats); +} +static inline std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) { + return infer_type(input).RoiPooling2d_forward(input, rois, pooledHeight, pooledWidth, spatialScale); +} +static inline Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes) { + return infer_type(input).RoiPooling2d_backward(input, rois, pooledHeight, pooledWidth, spatialScale, gradOutput, argmaxes); +} +static inline Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu(self, lower, upper, training, generator); +} +static inline Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_(self, lower, upper, training, generator); +} +static inline Tensor select(const Tensor & self, int64_t dim, int64_t index) { + return infer_type(self).select(self, dim, index); +} +static inline Tensor selu(const Tensor & self) { + return infer_type(self).selu(self); +} +static inline Tensor & selu_(Tensor & self) { + return infer_type(self).selu_(self); +} +static inline int64_t size(const Tensor & self, int64_t dim) { + return infer_type(self).size(self, dim); +} +static inline Tensor slice(const Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) { + return infer_type(self).slice(self, dim, start, end, step); +} +static inline std::vector split(const Tensor & self, int64_t split_size, int64_t dim) { + return infer_type(self).split(self, split_size, dim); +} +static inline Tensor squeeze(const Tensor & self) { + return infer_type(self).squeeze(self); +} +static inline Tensor squeeze(const Tensor & self, int64_t dim) { + return infer_type(self).squeeze(self, dim); +} +static inline Tensor & squeeze_(Tensor & self) { + return infer_type(self).squeeze_(self); +} +static inline Tensor & squeeze_(Tensor & self, int64_t dim) { + return infer_type(self).squeeze_(self, dim); +} +static inline Tensor stack(TensorList tensors, int64_t dim) { + return infer_type(tensors).stack(tensors, dim); +} +static inline Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim) { + return infer_type(result).stack_out(result, tensors, dim); +} +static inline Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided, const Tensor & window, int64_t pad_end) { + return infer_type(self).stft(self, frame_length, hop, fft_size, return_onesided, window, pad_end); +} +static inline int64_t stride(const Tensor & self, int64_t dim) { + return infer_type(self).stride(self, dim); +} +static inline Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + return infer_type(self).transpose_(self, dim0, dim1); +} +static inline Tensor & t_(Tensor & self) { + return infer_type(self).t_(self); +} +static inline Tensor type_as(const Tensor & self, const Tensor & other) { + return infer_type(self).type_as(self, other); +} +static inline Tensor unsqueeze(const Tensor & self, int64_t dim) { + return infer_type(self).unsqueeze(self, dim); +} +static inline Tensor & unsqueeze_(Tensor & self, int64_t dim) { + return infer_type(self).unsqueeze_(self, dim); +} +static inline Tensor view_as(const Tensor & self, const Tensor & other) { + return 
infer_type(self).view_as(self, other); +} +static inline Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) { + return infer_type(self).where(condition, self, other); +} +static inline Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other) { + return infer_type(self)._s_where(condition, self, other); +} +static inline Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output) { + return infer_type(self)._standard_gamma_grad(self, output); +} +static inline Tensor poisson(const Tensor & self, Generator * generator) { + return infer_type(self).poisson(self, generator); +} +static inline Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) { + return infer_type(weight_arr)._cudnn_rnn_flatten_weight(weight_arr, weight_stride0, input_size, mode, hidden_size, num_layers, batch_first, bidirectional); +} +static inline std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state) { + return infer_type(input)._cudnn_rnn(input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state); +} +static inline std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array output_mask) { + return infer_type(input)._cudnn_rnn_backward(input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask); +} + +} diff --git a/aten/doc/Tensor.h b/aten/doc/Tensor.h new file mode 100644 index 0000000..7cfc669 --- /dev/null +++ b/aten/doc/Tensor.h @@ -0,0 +1,464 @@ +#pragma once + +#include "ATen/Generator.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/TensorAccessor.h" +#include "ATen/TensorImpl.h" +#include "ATen/TensorBase.h" +#include "ATen/Storage.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Utils.h" + +namespace at { +struct Type; + +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. 
it is not associated with any underlying TensorImpl, and
+// special care must be taken to handle this.
+struct Tensor : public detail::TensorBase {
+  Tensor() : TensorBase() {}
+  Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {}
+  Tensor(const TensorBase & rhs) : TensorBase(rhs) {}
+  Tensor(const Tensor & rhs) = default;
+  Tensor(Tensor && rhs) noexcept = default;
+
+  // reimplemented from TensorBase so the return type is Tensor rather than TensorBase
+  Tensor & operator=(Tensor && rhs) & {
+    rhs.swap(*this);
+    return *this;
+  }
+  Tensor & operator=(Tensor const & rhs) & {
+    //Tensor ctor retains original rhs.pImpl
+    //then rhs.pImpl is swapped with this->pImpl
+    //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl
+    Tensor(rhs).swap(*this);
+    return *this;
+  }
+
+  inline Tensor & operator=(Tensor const & rhs) &&;
+  Tensor & operator=(Scalar v) &&;
+  const char * toString() const {
+    return pImpl->toString();
+  }
+  IntList sizes() const {
+    return pImpl->sizes();
+  }
+  IntList strides() const {
+    return pImpl->strides();
+  }
+  int64_t ndimension() const {
+    return dim();
+  }
+  Type & type() const {
+    return pImpl->type();
+  }
+  std::unique_ptr<Storage> storage() const {
+    return pImpl->storage();
+  }
+  inline Tensor toType(const Type & t) const;
+  inline Tensor & copy_(const Tensor & src, bool non_blocking=false);
+  inline Tensor toType(ScalarType t) const;
+  inline Tensor toBackend(Backend b) const;
+
+  template<typename T>
+  T * data() const;
+
+  void * unsafeGetTH(bool retain) const {
+    return pImpl->unsafeGetTH(retain);
+  }
+
+  // Purposely not defined here to avoid inlining
+  void print() const;
+
+  //toLongData(), toFloatData() etc.
+  #define TO_TYPE_DATA(T,name,_) \
+  T * to##name##Data() const;
+  AT_FORALL_SCALAR_TYPES(TO_TYPE_DATA)
+  #undef TO_TYPE_DATA
+
+  #define TO_C_TYPE(T,name,_) \
+  T toC##name () const;
+  AT_FORALL_SCALAR_TYPES(TO_C_TYPE)
+  #undef TO_C_TYPE
+
+  template<typename T, size_t N>
+  TensorAccessor<T,N> accessor() {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data<T>()");
+    AT_ASSERT(dim() == N, "expected %d dims but tensor has %d",N,dim());
+    return TensorAccessor<T,N>(data<T>(),sizes().data(),strides().data());
+  }
+
+  Tensor operator-() const;
+  Tensor& operator+=(const Tensor & other);
+  Tensor& operator+=(Scalar other);
+  Tensor& operator-=(const Tensor & other);
+  Tensor& operator-=(Scalar other);
+  Tensor& operator*=(const Tensor & other);
+  Tensor& operator*=(Scalar other);
+  Tensor& operator/=(const Tensor & other);
+  Tensor& operator/=(Scalar other);
+  Tensor operator[](int64_t idx) const;
+
+  // STOP. Thinking of adding a method here, which only makes use
+  // of other ATen methods? Define it in native_functions.yaml.
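+  // A minimal usage sketch for the data<T>() and accessor<T,N>() declarations
+  // above (illustrative only; assumes a 2-dimensional float Tensor named "t"
+  // already exists):
+  //   auto a = t.accessor<float,2>();      // asserts that t.dim() == 2
+  //   for (int64_t i = 0; i < t.size(0); i++)
+  //     a[i][0] = 0;                       // raw element access, no per-call dispatch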
+ + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + int64_t numel() const; + Tensor & set_(Storage & source); + Tensor & set_(Storage & source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor t() const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor clone() const; + Tensor view(IntList size) const; + Tensor & resize_as_(const Tensor & the_template); + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & 
ne_(const Tensor & other); + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min(const Tensor & other) const; + Tensor min() const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max(const Tensor & other) const; + Tensor max() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + Tensor all() const; + Tensor any() const; + int64_t get_device() const; + Tensor abs() const; + Tensor & abs_(); + Tensor & sigmoid_(); + Tensor sigmoid() const; + Tensor & log_(); + Tensor log() const; + Tensor & log1p_(); + Tensor log1p() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & exp_(); + Tensor exp() const; + Tensor & expm1_(); + Tensor expm1() const; + Tensor & cos_(); + Tensor cos() const; + Tensor & acos_(); + Tensor acos() const; + Tensor & cosh_(); + Tensor cosh() const; + Tensor & sin_(); + Tensor sin() const; + Tensor & asin_(); + Tensor asin() const; + Tensor & sinh_(); + Tensor sinh() const; + Tensor & tan_(); + Tensor tan() const; + Tensor & atan_(); + Tensor atan() const; + Tensor & tanh_(); + Tensor tanh() const; + Tensor & erf_(); + Tensor erf() const; + Tensor & erfc_(); + Tensor erfc() const; + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & sqrt_(); + Tensor sqrt() const; + Tensor & rsqrt_(); + Tensor rsqrt() const; + Tensor & ceil_(); + Tensor ceil() const; + Tensor & floor_(); + Tensor floor() const; + Tensor & round_(); + Tensor round() const; + Tensor & trunc_(); + Tensor trunc() const; + Tensor & frac_(); + Tensor frac() const; + Tensor mean(int64_t dim, bool keepdim=false) const; + Tensor mean() const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor var(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor std(bool unbiased=true) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor norm(Scalar p=2) const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(Scalar exponent) const; + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor & zero_(); + Tensor sum(int64_t dim, bool keepdim=false) const; + Tensor sum() const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod() const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor add(SparseTensor other, Scalar alpha=1) const; + Tensor & 
add_(Scalar other, Scalar alpha=1); + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor & add_(SparseTensor other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor mul(Scalar other) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(Scalar other); + Tensor & mul_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor div(const Tensor & other) const; + Tensor & div_(Scalar other); + Tensor & div_(const Tensor & other); + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor _dot(const Tensor & tensor) const; + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor addmm(SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & addmm_(SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor _addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor _addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor _ger(const Tensor & vec2) const; + Tensor _mv(const Tensor & vec) const; + Tensor _mm(const Tensor & mat2) const; + Tensor bmm(const Tensor & mat2) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gesv(const Tensor & A) const; + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor inverse() const; + Tensor potrf(bool 
upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor bernoulli(Generator * generator=nullptr) const; + Tensor _standard_gamma(Generator * generator=nullptr) const; + Tensor & _copy_ignoring_overlaps_(const Tensor & src); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset=-1) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset=-1); + Tensor & sparse_raw_resize_(IntList size, int64_t nDimI, int64_t nDimV); + Tensor & reshape_(IntList size, IntList stride); + Tensor _sparse_mask(SparseTensor mask) const; + Tensor to_dense() const; + int64_t _dimI() const; + int64_t _dimV() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08) const; + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p=0.5, Generator * generator=nullptr); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor conv_tbc(const Tensor & weight, const Tensor & bias, int64_t pad) const; + std::tuple conv_tbc_backward(const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) const; + Tensor det() const; + std::tuple _det_with_svd() const; + Tensor dot(const Tensor & tensor) const; + Tensor expand(IntList size) const; + Tensor expand_as(const Tensor & other) const; + Tensor ger(const Tensor & vec2) const; + Tensor index(TensorList indices) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + Tensor 
matmul(const Tensor & other) const; + Tensor mm(const Tensor & mat2) const; + Tensor mv(const Tensor & vec) const; + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor repeat(IntList repeats) const; + Tensor select(int64_t dim, int64_t index) const; + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::vector split(int64_t split_size, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor stft(int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0) const; + int64_t stride(int64_t dim) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor & t_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor _s_where(const Tensor & condition, const Tensor & other) const; + Tensor _standard_gamma_grad(const Tensor & output) const; +}; + +} //namespace at diff --git a/aten/doc/Type.h b/aten/doc/Type.h new file mode 100644 index 0000000..5d8ff4f --- /dev/null +++ b/aten/doc/Type.h @@ -0,0 +1,1134 @@ +#pragma once + +#include +#include +#include + +#include "ATen/ATenGeneral.h" +#include "ATen/ArrayRef.h" +#include "ATen/Generator.h" +#include "ATen/Half.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/ScalarType.h" +#include "ATen/Scalar.h" +#include "ATen/Tensor.h" +#include "ATen/Allocator.h" + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Storage; +struct Generator; +struct Allocator; + +// Note [Empty versus 0-dim tensors] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Unlike Torch, ATen treats zero-dimension tensors as having ONE +// element (that is to say, a zero-dimensional tensor is a scalar!) +// This is in contrast to Torch, where a zero-dimension tensor has +// zero elements. +// +// Because we are backed by Torch tensors, we need to be able to +// represent this state (of numel==0). These tensors are represented +// by one-dimensional tensors with size[0] == 0 and stride[0] == 1 +// (the stride is arbitrary but matches the NumPy equivalent). 
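+// A minimal sketch of the convention above (illustrative only; assumes a Type
+// reference named "type", e.g. a CPU float type obtained from the Context):
+//   Tensor scalar = type.scalarTensor(1);  // zero-dimensional: numel() == 1
+//   Tensor empty  = type.ones({0});        // numel() == 0; stored with sizes {0}, strides {1}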
+constexpr std::array<int64_t, 1> kEmptySizes { {0} };
+constexpr std::array<int64_t, 1> kEmptyStrides { {1} };
+
+static inline void noop_deleter(void*) {}
+
+enum class TypeID {
+ CPUByte,
+ CPUChar,
+ CPUDouble,
+ CPUFloat,
+ CPUInt,
+ CPULong,
+ CPUShort,
+ CPUHalf,
+ SparseCPUByte,
+ SparseCPUChar,
+ SparseCPUDouble,
+ SparseCPUFloat,
+ SparseCPUInt,
+ SparseCPULong,
+ SparseCPUShort,
+ CUDAByte,
+ CUDAChar,
+ CUDADouble,
+ CUDAFloat,
+ CUDAInt,
+ CUDALong,
+ CUDAShort,
+ CUDAHalf,
+ SparseCUDAByte,
+ SparseCUDAChar,
+ SparseCUDADouble,
+ SparseCUDAFloat,
+ SparseCUDAInt,
+ SparseCUDALong,
+ SparseCUDAShort,
+ Undefined,
+ NumOptions
+};
+
+
+struct AT_API Type {
+ explicit Type(Context * context)
+ : context(context) {}
+ virtual ~Type() {}
+ virtual ScalarType scalarType() const = 0;
+ virtual Backend backend() const = 0;
+ virtual bool is_cuda() const = 0;
+ virtual bool is_sparse() const = 0;
+ virtual bool is_distributed() const = 0;
+ static void registerAll(Context * context);
+ virtual std::unique_ptr<Storage> storage() const = 0;
+ virtual std::unique_ptr<Storage> storage(size_t size) const = 0;
+ virtual std::unique_ptr<Storage> storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+ virtual std::unique_ptr<Storage> storageWithAllocator(int64_t size, std::unique_ptr<Allocator> allocator) const = 0;
+ virtual std::unique_ptr<Generator> generator() const = 0;
+ virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0;
+ virtual std::unique_ptr<Storage> unsafeStorageFromTH(void * th_pointer, bool retain) const = 0;
+ virtual const char * toString() const = 0;
+ virtual size_t elementSizeInBytes() const = 0;
+ virtual Type & toBackend(Backend b) const;
+ virtual Type & toScalarType(ScalarType s) const;
+ Context& get_context() const { return *context; }
+
+ // contiguous IDs for all types in the system
+ // for external dispatch
+ virtual TypeID ID() const = 0;
+
+ Tensor copy(const Tensor & src, bool non_blocking=false) const;
+ Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const;
+ virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0;
+
+ Tensor tensorFromBlob(void * data, IntList sizes, const std::function<void(void*)> & deleter=noop_deleter) const;
+ Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function<void(void*)> & deleter=noop_deleter) const;
+ Tensor tensorWithAllocator(IntList sizes, std::unique_ptr<Allocator> allocator) const;
+ Tensor tensorWithAllocator(IntList sizes, IntList strides, std::unique_ptr<Allocator> allocator) const;
+ Tensor scalarTensor(Scalar s) const;
+
+ bool operator==(const Type& other) const;
+ bool operator!=(const Type& other) const;
+
+ // example
+ // virtual Tensor * add(Tensor & a, Tensor & b) = 0;
+ virtual int64_t storage_offset(const Tensor & self) const;
+ virtual Tensor & resize_(Tensor & self, IntList size) const;
+ virtual Tensor & zeros_out(Tensor & result, IntList size) const;
+ virtual Tensor zeros(IntList size) const;
+ virtual Tensor & zeros_like_out(Tensor & result, const Tensor & input) const;
+ virtual Tensor zeros_like(const Tensor & input) const;
+ virtual Tensor & ones_out(Tensor & result, IntList size) const;
+ virtual Tensor ones(IntList size) const;
+ virtual Tensor & ones_like_out(Tensor & result, const Tensor & input) const;
+ virtual Tensor ones_like(const Tensor & input) const;
+ virtual int64_t numel(const Tensor & self) const;
+ virtual Tensor & set_(Tensor & self, Storage & source) const;
+ virtual Tensor & set_(Tensor & self, Storage & source, int64_t storage_offset, IntList size, IntList 
stride={}) const; + virtual Tensor & set_(Tensor & self, const Tensor & source) const; + virtual Tensor & set_(Tensor & self) const; + virtual Tensor & fill_(Tensor & self, Scalar value) const; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const; + virtual bool is_contiguous(const Tensor & self) const; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const; + Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const; + Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const; + Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const; + Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) const; + virtual Tensor & s_masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) const; + Tensor masked_select(const Tensor & self, const Tensor & mask) const; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const; + virtual Tensor t(const Tensor & self) const; + virtual Tensor & nonzero_out(Tensor & result, const Tensor & self) const; + virtual Tensor nonzero(const Tensor & self) const; + virtual Tensor contiguous(const Tensor & self) const; + virtual Tensor clone(const Tensor & self) const; + virtual Tensor view(const Tensor & self, IntList size) const; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const; + virtual Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const; + virtual Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index) const; + virtual Tensor take(const Tensor & self, const Tensor & index) const; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const; + virtual Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor arange(Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor & arange_out(Tensor & result, Scalar end) const; + virtual Tensor arange(Scalar end) const; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const 
Tensor & index, const Tensor & src) const; + virtual Tensor & gather_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const; + virtual void* data_ptr(const Tensor & self) const; + virtual bool equal(const Tensor & self, const Tensor & other) const; + virtual Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __and__(const Tensor & self, Scalar other) const; + Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___and___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __and__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __iand__(Tensor & self, Scalar other) const; + Tensor & __iand__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) const; + virtual Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __or__(const Tensor & self, Scalar other) const; + Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___or___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __or__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ior__(Tensor & self, Scalar other) const; + Tensor & __ior__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const; + virtual Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __xor__(const Tensor & self, Scalar other) const; + Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___xor___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __xor__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const; + Tensor & __ixor__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const; + virtual Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const; + Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___lshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __lshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const; + Tensor & __ilshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const; + Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___rshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __rshift__(const Tensor & self, 
const Tensor & other) const; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const; + Tensor & __irshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor lt(const Tensor & self, Scalar other) const; + Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_lt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor lt(const Tensor & self, const Tensor & other) const; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const; + virtual Tensor & lt_(Tensor & self, Scalar other) const; + Tensor & lt_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const; + virtual Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor gt(const Tensor & self, Scalar other) const; + Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_gt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor gt(const Tensor & self, const Tensor & other) const; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const; + virtual Tensor & gt_(Tensor & self, Scalar other) const; + Tensor & gt_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const; + virtual Tensor & le_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor le(const Tensor & self, Scalar other) const; + Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_le_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor le(const Tensor & self, const Tensor & other) const; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const; + virtual Tensor & le_(Tensor & self, Scalar other) const; + Tensor & le_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const; + virtual Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor ge(const Tensor & self, Scalar other) const; + Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_ge_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor ge(const Tensor & self, const Tensor & other) const; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const; + virtual Tensor & ge_(Tensor & self, Scalar other) const; + Tensor & ge_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const; + virtual Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor eq(const Tensor & self, Scalar other) const; + Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_eq_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor eq(const Tensor & self, const Tensor & other) const; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const; + virtual Tensor & eq_(Tensor & self, Scalar other) const; + Tensor & eq_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) 
const; + virtual Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor ne(const Tensor & self, Scalar other) const; + Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_ne_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor ne(const Tensor & self, const Tensor & other) const; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const; + virtual Tensor & ne_(Tensor & self, Scalar other) const; + Tensor & ne_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const; + virtual std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false) const; + Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_min_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor min(const Tensor & self, const Tensor & other) const; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const; + virtual Tensor min(const Tensor & self) const; + virtual std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const; + Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_max_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor max(const Tensor & self, const Tensor & other) const; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const; + virtual Tensor max(const Tensor & self) const; + virtual std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor median(const Tensor & self) const; + virtual std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool descending=false) const; + virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const; + virtual std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + virtual Tensor all(const Tensor & self) const; + virtual Tensor any(const Tensor & self) const; + virtual int64_t get_device(const Tensor & self) const; + virtual Tensor & abs_out(Tensor & result, const Tensor & self) const; + virtual Tensor abs(const Tensor & self) const; + virtual Tensor & abs_(Tensor & self) const; + virtual Tensor & sigmoid_(Tensor & self) const; + virtual Tensor & sigmoid_out(Tensor & result, const Tensor & self) const; + virtual Tensor sigmoid(const Tensor & self) const; + 
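Many of the reductions declared above (min, max, kthvalue, mode, median, sort, topk) return a (values, indices) pair when given a dim argument, while the two-tensor overloads are elementwise. A minimal sketch, assuming only the `ATen/ATen.h` umbrella header and an already-obtained `at::Type` reference (neither comes from this patch):

```cpp
#include <tuple>
#include "ATen/ATen.h"  // assumed umbrella header

// Two flavours of max() as declared above: elementwise max against another
// tensor, and a reduction over a dimension returning (values, indices).
void max_flavours(at::Type & T) {
  at::Tensor a = T.rand({2, 3});          // rand(IntList, Generator* = nullptr)
  at::Tensor b = T.rand({2, 3});

  at::Tensor elementwise = T.max(a, b);   // same shape as a and b

  at::Tensor values, indices;
  std::tie(values, indices) = T.max(a, /*dim=*/1, /*keepdim=*/false);
  // values has size {2}; indices holds the argmax positions along dim 1.
}
```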
virtual Tensor & log_(Tensor & self) const; + virtual Tensor & log_out(Tensor & result, const Tensor & self) const; + virtual Tensor log(const Tensor & self) const; + virtual Tensor & log1p_(Tensor & self) const; + virtual Tensor & log1p_out(Tensor & result, const Tensor & self) const; + virtual Tensor log1p(const Tensor & self) const; + virtual Tensor & lgamma_out(Tensor & result, const Tensor & self) const; + virtual Tensor lgamma(const Tensor & self) const; + virtual Tensor & lgamma_(Tensor & self) const; + virtual Tensor & digamma_out(Tensor & result, const Tensor & self) const; + virtual Tensor digamma(const Tensor & self) const; + virtual Tensor & digamma_(Tensor & self) const; + virtual Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & self) const; + virtual Tensor polygamma(int64_t n, const Tensor & self) const; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const; + virtual Tensor & exp_(Tensor & self) const; + virtual Tensor & exp_out(Tensor & result, const Tensor & self) const; + virtual Tensor exp(const Tensor & self) const; + virtual Tensor & expm1_(Tensor & self) const; + virtual Tensor & expm1_out(Tensor & result, const Tensor & self) const; + virtual Tensor expm1(const Tensor & self) const; + virtual Tensor & cos_(Tensor & self) const; + virtual Tensor & cos_out(Tensor & result, const Tensor & self) const; + virtual Tensor cos(const Tensor & self) const; + virtual Tensor & acos_(Tensor & self) const; + virtual Tensor & acos_out(Tensor & result, const Tensor & self) const; + virtual Tensor acos(const Tensor & self) const; + virtual Tensor & cosh_(Tensor & self) const; + virtual Tensor & cosh_out(Tensor & result, const Tensor & self) const; + virtual Tensor cosh(const Tensor & self) const; + virtual Tensor & sin_(Tensor & self) const; + virtual Tensor & sin_out(Tensor & result, const Tensor & self) const; + virtual Tensor sin(const Tensor & self) const; + virtual Tensor & asin_(Tensor & self) const; + virtual Tensor & asin_out(Tensor & result, const Tensor & self) const; + virtual Tensor asin(const Tensor & self) const; + virtual Tensor & sinh_(Tensor & self) const; + virtual Tensor & sinh_out(Tensor & result, const Tensor & self) const; + virtual Tensor sinh(const Tensor & self) const; + virtual Tensor & tan_(Tensor & self) const; + virtual Tensor & tan_out(Tensor & result, const Tensor & self) const; + virtual Tensor tan(const Tensor & self) const; + virtual Tensor & atan_(Tensor & self) const; + virtual Tensor & atan_out(Tensor & result, const Tensor & self) const; + virtual Tensor atan(const Tensor & self) const; + virtual Tensor & tanh_(Tensor & self) const; + virtual Tensor & tanh_out(Tensor & result, const Tensor & self) const; + virtual Tensor tanh(const Tensor & self) const; + virtual Tensor & erf_(Tensor & self) const; + virtual Tensor & erf_out(Tensor & result, const Tensor & self) const; + virtual Tensor erf(const Tensor & self) const; + virtual Tensor & erfc_(Tensor & self) const; + virtual Tensor & erfc_out(Tensor & result, const Tensor & self) const; + virtual Tensor erfc(const Tensor & self) const; + virtual Tensor & erfinv_(Tensor & self) const; + virtual Tensor & erfinv_out(Tensor & result, const Tensor & self) const; + virtual Tensor erfinv(const Tensor & self) const; + virtual Tensor & sqrt_(Tensor & self) const; + virtual Tensor & sqrt_out(Tensor & result, const Tensor & self) const; + virtual Tensor sqrt(const Tensor & self) const; + virtual Tensor & rsqrt_(Tensor & self) const; + virtual Tensor & rsqrt_out(Tensor & result, const 
Tensor & self) const; + virtual Tensor rsqrt(const Tensor & self) const; + virtual Tensor & ceil_(Tensor & self) const; + virtual Tensor & ceil_out(Tensor & result, const Tensor & self) const; + virtual Tensor ceil(const Tensor & self) const; + virtual Tensor & floor_(Tensor & self) const; + virtual Tensor & floor_out(Tensor & result, const Tensor & self) const; + virtual Tensor floor(const Tensor & self) const; + virtual Tensor & round_(Tensor & self) const; + virtual Tensor & round_out(Tensor & result, const Tensor & self) const; + virtual Tensor round(const Tensor & self) const; + virtual Tensor & trunc_(Tensor & self) const; + virtual Tensor & trunc_out(Tensor & result, const Tensor & self) const; + virtual Tensor trunc(const Tensor & self) const; + virtual Tensor & frac_(Tensor & self) const; + virtual Tensor & frac_out(Tensor & result, const Tensor & self) const; + virtual Tensor frac(const Tensor & self) const; + virtual Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor mean(const Tensor & self) const; + virtual Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor var(const Tensor & self, bool unbiased=true) const; + virtual Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor std(const Tensor & self, bool unbiased=true) const; + virtual Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const; + virtual Tensor norm(const Tensor & self, Scalar p=2) const; + virtual Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const; + virtual Tensor & reciprocal_out(Tensor & result, const Tensor & self) const; + virtual Tensor reciprocal(const Tensor & self) const; + virtual Tensor & reciprocal_(Tensor & self) const; + virtual Tensor & neg_out(Tensor & result, const Tensor & self) const; + virtual Tensor neg(const Tensor & self) const; + virtual Tensor & neg_(Tensor & self) const; + Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_atan2_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor atan2(const Tensor & self, const Tensor & other) const; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const; + Tensor & atan2_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const; + virtual Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent) const; + virtual Tensor pow(const Tensor & self, Scalar exponent) const; + Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) const; + 
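The statistics declared above (mean, var, std, norm) follow one pattern: a whole-tensor overload plus a per-dimension overload with an optional keepdim flag. A small sketch under the same assumptions as before (umbrella header, a `Type` reference obtained elsewhere):

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// Whole-tensor vs. per-dimension reductions, as declared above.
void reductions(at::Type & T) {
  at::Tensor x = T.rand({4, 5});

  at::Tensor total_mean = T.mean(x);                               // mean over all elements
  at::Tensor row_mean   = T.mean(x, /*dim=*/1);                    // size {4}
  at::Tensor row_mean_k = T.mean(x, /*dim=*/1, /*keepdim=*/true);  // size {4, 1}

  at::Tensor col_sd = T.std(x, /*dim=*/0, /*unbiased=*/true);      // size {5}
  at::Tensor row_l2 = T.norm(x, /*p=*/2, /*dim=*/1);               // per-row L2 norm, size {4}
}
```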
virtual Tensor & s_pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) const; + Tensor pow(const Tensor & self, const Tensor & exponent) const; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const; + virtual Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self) const; + virtual Tensor pow(Scalar base, const Tensor & self) const; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const; + Tensor & pow_(Tensor & self, const Tensor & exponent) const; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const; + Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & s_lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) const; + Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const; + Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor linspace(Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const; + virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const; + virtual Tensor & zero_(Tensor & self) const; + virtual Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor sum(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor sum(const Tensor & self) const; + virtual Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor prod(const Tensor & self) const; + virtual Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim) const; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const; + virtual Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim) const; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const; + virtual Tensor & sign_out(Tensor & result, const Tensor & self) const; + virtual Tensor sign(const Tensor & self) const; + virtual Tensor & sign_(Tensor & self) const; + virtual Tensor trace(const Tensor & self) const; + virtual Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1) const; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor s_add(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor add(const Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor & add_(Tensor & self, Scalar other, 
Scalar alpha=1) const; + Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_add_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & add_(Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1) const; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor s_sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & mul_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor mul(const Tensor & self, Scalar other) const; + Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_mul_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor mul(const Tensor & self, const Tensor & other) const; + virtual Tensor s_mul(const Tensor & self, const Tensor & other) const; + virtual Tensor & mul_(Tensor & self, Scalar other) const; + Tensor & mul_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_mul_(Tensor & self, const Tensor & other) const; + virtual Tensor & div_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor div(const Tensor & self, Scalar other) const; + Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_div_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor div(const Tensor & self, const Tensor & other) const; + virtual Tensor s_div(const Tensor & self, const Tensor & other) const; + virtual Tensor & div_(Tensor & self, Scalar other) const; + Tensor & div_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_div_(Tensor & self, const Tensor & other) const; + virtual Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor fmod(const Tensor & self, Scalar other) const; + Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor fmod(const Tensor & self, const Tensor & other) const; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const; + virtual Tensor & fmod_(Tensor & self, Scalar other) const; + Tensor & fmod_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const; + virtual Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor remainder(const Tensor & self, Scalar other) const; + Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_remainder_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor remainder(const Tensor & self, const Tensor & other) const; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const; + 
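The arithmetic entries above come in Scalar and Tensor flavours, each with an in-place `_` variant, and add/sub carry an `alpha` multiplier (result = self + alpha * other). A hedged sketch, again assuming the umbrella header and a `Type` reference from outside this patch:

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// Scalar vs. Tensor right-hand sides, alpha scaling, and in-place variants.
void arithmetic(at::Type & T) {
  at::Tensor a = T.ones({3});
  at::Tensor b = T.ones({3});

  at::Tensor c = T.add(a, b, /*alpha=*/2);   // 1 + 2*1 == 3 everywhere
  at::Tensor d = T.mul(c, /*other=*/0.5);    // scalar multiply -> 1.5
  T.div_(d, 3);                              // in-place divide  -> 0.5

  at::Tensor r = T.remainder(a, 2);          // elementwise remainder -> 1
}
```

The parallel `*_out` overloads in the declarations take a `result` tensor as their first argument and write into it rather than allocating a new tensor.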
virtual Tensor & remainder_(Tensor & self, Scalar other) const; + Tensor & remainder_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const; + virtual Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max) const; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const; + virtual Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min) const; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const; + virtual Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max) const; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const; + virtual Tensor _dot(const Tensor & self, const Tensor & tensor) const; + virtual Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const; + virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const; + virtual Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const; + virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const; + virtual Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim=-1) const; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const; + virtual Tensor & eye_out(Tensor & result, int64_t n, int64_t m=-1) const; + virtual Tensor eye(int64_t n, int64_t m=-1) const; + virtual Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const; + Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_(Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s__addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s__addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _addmv_(Tensor & self, const Tensor & mat, const 
Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s__addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s__addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) const; + virtual Tensor _ger(const Tensor & self, const Tensor & vec2) const; + virtual Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const; + virtual Tensor _mv(const Tensor & self, const Tensor & vec) const; + virtual Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor _mm(const Tensor & self, const Tensor & mat2) const; + virtual Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const; + Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar 
value=1) const; + virtual Tensor & s_addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual std::tuple gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A) const; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const; + virtual std::tuple gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) const; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const; + virtual std::tuple trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + virtual std::tuple symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false, bool upper=true) const; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const; + virtual std::tuple eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false) const; + virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const; + virtual std::tuple svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some=true) const; + virtual std::tuple svd(const Tensor & self, bool some=true) const; + virtual Tensor & inverse_out(Tensor & output, const Tensor & self) const; + virtual Tensor inverse(const Tensor & self) const; + virtual Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper=true) const; + virtual Tensor potrf(const Tensor & self, bool upper=true) const; + virtual Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper=true) const; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const; + virtual Tensor & potri_out(Tensor & output, const Tensor & self, bool upper=true) const; + virtual Tensor potri(const Tensor & self, bool upper=true) const; + virtual std::tuple pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper=true, Scalar tol=-1) const; + virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const; + virtual std::tuple qr_out(Tensor & res1, Tensor & res2, const Tensor & self) const; + virtual std::tuple qr(const Tensor & self) const; + virtual std::tuple geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) const; + virtual std::tuple geqrf(const Tensor & self) const; + virtual Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) const; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const; + virtual Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + virtual std::tuple btrifact_out(Tensor & result, Tensor & 
pivots, const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const; + virtual Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const; + virtual Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator=nullptr) const; + virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, const Tensor & mean, double std=1, Generator * generator=nullptr) const; + virtual Tensor normal(const Tensor & mean, double std=1, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor normal(double mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const; + virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const; + virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const; + virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const; + virtual Tensor & rand_out(Tensor & result, IntList size, Generator * generator=nullptr) const; + virtual Tensor rand(IntList size, Generator * generator=nullptr) const; + virtual Tensor & randn_out(Tensor & result, IntList size, Generator * generator=nullptr) const; + virtual Tensor randn(IntList size, Generator * generator=nullptr) const; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const; + virtual Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor bernoulli(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor _standard_gamma(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total) const; + virtual Tensor _dirichlet_grad(const Tensor & x, const 
Tensor & alpha, const Tensor & total) const; + virtual Tensor tensor(Storage & storage, int64_t storageOffset, IntList size, IntList stride={}) const; + virtual Tensor tensor(IntList size) const; + virtual Tensor tensor(IntList size, IntList stride) const; + virtual Tensor tensor() const; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const; + virtual Tensor alias(const Tensor & self) const; + virtual Tensor & _copy_ignoring_overlaps_(Tensor & self, const Tensor & src) const; + virtual Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor & sparse_raw_resize_(Tensor & self, IntList size, int64_t nDimI, int64_t nDimV) const; + virtual Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim=0) const; + virtual Tensor _cat(TensorList tensors, int64_t dim=0) const; + virtual Tensor & reshape_(Tensor & self, IntList size, IntList stride) const; + virtual Tensor _sparse_mask(const Tensor & self, SparseTensor mask) const; + virtual Tensor to_dense(const Tensor & self) const; + virtual int64_t _dimI(const Tensor & self) const; + virtual int64_t _dimV(const Tensor & self) const; + virtual int64_t _nnz(const Tensor & self) const; + virtual Tensor coalesce(const Tensor & self) const; + virtual bool is_coalesced(const Tensor & self) const; + virtual Tensor _indices(const Tensor & self) const; + virtual Tensor _values(const Tensor & self) const; + virtual Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true) const; + virtual Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true) const; + virtual Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + 
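The NN entry points declared above (binary_cross_entropy, kl_div, and the losses that follow) take the prediction and target first, then an optional weight and the reduction flags. A sketch of one call; the `at::CPU` accessor and `ATen/ATen.h` header are assumptions, while the binary_cross_entropy signature is taken from the declaration in this header:

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// One call into the generated NN API declared above.
void bce_example() {
  at::Type & T = at::CPU(at::kFloat);        // accessor assumed, not in this diff

  at::Tensor preds   = T.sigmoid(T.randn({4}));  // predictions in (0, 1)
  at::Tensor targets = T.ones({4});

  // Unweighted, averaged loss: weight defaults to an undefined Tensor.
  at::Tensor loss = T.binary_cross_entropy(preds, targets, /*weight=*/at::Tensor(),
                                           /*size_average=*/true, /*reduce=*/true);
}
```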
virtual Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true) const; + virtual Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true) const; + virtual Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual std::tuple multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual std::tuple multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & 
grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) const; + virtual Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) const; + virtual Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual Tensor nll_loss(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual std::tuple nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual std::tuple nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual std::tuple nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual std::tuple nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, 
const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true) const; + virtual Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true) const; + virtual Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor & soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor elu(const Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) const; + virtual Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) const; + virtual Tensor & elu_(Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim=-1) const; + virtual Tensor glu(const Tensor & self, int64_t dim=-1) const; + virtual Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor glu_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) const; + virtual Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) const; + virtual Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor hardtanh(const Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & hardtanh_(Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor leaky_relu(const Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope) const; + virtual Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope) const; + virtual Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope) const; + virtual 
Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope) const; + virtual Tensor & leaky_relu_(Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope) const; + virtual Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) const; + virtual Tensor log_sigmoid(const Tensor & self) const; + virtual std::tuple log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) const; + virtual std::tuple log_sigmoid_forward(const Tensor & self) const; + virtual Tensor & log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) const; + virtual Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) const; + virtual Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const; + virtual Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor log_softmax_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight) const; + virtual Tensor prelu(const Tensor & self, const Tensor & weight) const; + virtual Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight) const; + virtual Tensor prelu_forward(const Tensor & self, const Tensor & weight) const; + virtual std::tuple prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight) const; + virtual std::tuple prelu_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, std::array output_mask={{true, true}}) const; + virtual Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) const; + virtual Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) const; + virtual Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) const; + virtual Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) const; + virtual Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, 
Generator * generator) const; + virtual Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor softmax(const Tensor & self, int64_t dim) const; + virtual Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor softmax_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta=1, Scalar threshold=20) const; + virtual Tensor softplus(const Tensor & self, Scalar beta=1, Scalar threshold=20) const; + virtual Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) const; + virtual Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold) const; + virtual Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) const; + virtual Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) const; + virtual Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd=0.5) const; + virtual Tensor softshrink(const Tensor & self, Scalar lambd=0.5) const; + virtual Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd) const; + virtual Tensor softshrink_forward(const Tensor & self, Scalar lambd) const; + virtual Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd) const; + virtual Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd) const; + virtual Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold(const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & adaptive_avg_pool2d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) const; + virtual Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self) const; + virtual Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size) 
const; + virtual Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) const; + virtual Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) const; + virtual std::tuple adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual std::tuple adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor & avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList 
kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual std::tuple fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) const; + virtual Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) const; + virtual std::tuple max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual std::tuple max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual std::tuple max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual std::tuple max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + 
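The declarations above all follow one naming scheme that holds across this generated Type interface: a functional form with defaulted trailing arguments, an "_out" form that writes into caller-provided tensors, explicit "_forward"/"_backward" pairs used by the derivatives machinery, and, where it exists, a trailing-underscore in-place variant. The following minimal usage sketch assumes that the generated free functions in ATen/Functions.h (at::max_pool2d, at::max_pool2d_out, at::elu) and the Type-based factories (at::CPU(at::kFloat)) forward to these virtuals; the exact bindings are an assumption, not something this diff confirms.

#include <ATen/ATen.h>
#include <tuple>

void pooling_sketch() {
  // Assumed factory style of this ATen snapshot: build a CPU float tensor.
  at::Tensor input = at::CPU(at::kFloat).ones({1, 3, 8, 8});

  // Functional form: stride/padding/dilation/ceil_mode keep their declared defaults.
  at::Tensor output, indices;
  std::tie(output, indices) = at::max_pool2d(input, /*kernel_size=*/{2, 2});

  // "_out" form: results are written into caller-provided tensors instead of
  // being freshly allocated.
  at::Tensor out = at::CPU(at::kFloat).tensor();
  at::Tensor idx = at::CPU(at::kLong).tensor();
  at::max_pool2d_out(out, idx, input, {2, 2});

  // Scalar-defaulted ops follow the same pattern, e.g. elu(self, alpha=1, scale=1).
  at::Tensor activated = at::elu(output);
}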
virtual Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool2d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, 
IntList padding) const; + virtual Tensor replication_pad1d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad1d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_linear1d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_bilinear2d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_trilinear3d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor 
upsample_trilinear3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & _sigmoid_out(Tensor & output, const Tensor & self) const; + virtual Tensor _sigmoid(const Tensor & self) const; + virtual Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self) const; + virtual Tensor _sigmoid_forward(const Tensor & self) const; + virtual Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) const; + virtual Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output) const; + virtual Tensor & _tanh_out(Tensor & output, const Tensor & self) const; + virtual Tensor _tanh(const Tensor & self) const; + virtual Tensor & _tanh_forward_out(Tensor & output, const Tensor & self) const; + virtual Tensor _tanh_forward(const Tensor & self) const; + virtual Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) const; + virtual Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output) const; + virtual Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double 
momentum, double eps) const; + virtual Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std) const; + virtual std::tuple thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & 
self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, 
IntList stride, IntList padding, IntList dilation, std::array output_mask={{true, true}}) const; + virtual Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated3d_forward(const 
Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) const; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08) const; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled) const; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const; + virtual Tensor & bernoulli_(Tensor & self, double p=0.5, Generator * generator=nullptr) const; + virtual Tensor cat(TensorList tensors, int64_t dim=0) const; + virtual Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim=0) const; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const; + virtual bool cudnn_is_acceptable(const Tensor & self) const; + virtual Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) const; + virtual Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) const; + virtual Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) const; + virtual std::tuple _convolution_double_backward(const Tensor & ggI, 
const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) const; + virtual Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv_tbc(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad) const; + virtual std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) const; + virtual Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) const; + virtual Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) const; + virtual std::tuple cudnn_batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon) const; + virtual std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon) const; + virtual Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) const; + virtual Tensor cudnn_convolution_backward_bias(const Tensor & grad_output) const; + virtual Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & 
bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) const; + virtual Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output) const; + virtual Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid) const; + virtual std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output) const; + virtual Tensor det(const Tensor & self) const; + virtual std::tuple _det_with_svd(const Tensor & self) const; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const; + virtual Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) const; + virtual Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) const; + virtual Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) const; + virtual Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type) const; + virtual Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) const; + virtual Tensor empty_like(const Tensor & self) const; + virtual std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) const; + virtual Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) const; + virtual Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) const; + virtual Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) const; + virtual Tensor expand(const Tensor & self, IntList size) const; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const; + virtual Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce) const; + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const; + virtual Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) const; + virtual Tensor 
index(const Tensor & self, TensorList indices) const; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const; + virtual bool is_cuda(const Tensor & self) const; + virtual bool is_distributed(const Tensor & self) const; + virtual bool is_floating_point(const Tensor & self) const; + virtual bool is_nonzero(const Tensor & self) const; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const; + virtual bool is_signed(const Tensor & self) const; + virtual bool is_sparse(const Tensor & self) const; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const; + virtual std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const; + virtual Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const; + virtual Tensor & mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const; + virtual Tensor permute(const Tensor & self, IntList dims) const; + virtual Tensor pin_memory(const Tensor & self) const; + virtual Tensor rand_like(const Tensor & self) const; + virtual Tensor randn_like(const Tensor & self) const; + virtual Tensor repeat(const Tensor & self, IntList repeats) const; + virtual std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) const; + virtual Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes) const; + virtual Tensor rrelu(const Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_(Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const; + virtual Tensor selu(const Tensor & self) const; + virtual Tensor & selu_(Tensor & self) const; + virtual int64_t size(const Tensor & self, int64_t dim) const; + virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const; + virtual Tensor squeeze(const Tensor & self) const; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const; + virtual Tensor & squeeze_(Tensor & self) const; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const; + virtual Tensor stack(TensorList tensors, int64_t dim=0) const; + virtual Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim=0) const; + virtual Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0) const; + virtual int64_t stride(const Tensor & self, int64_t dim) const; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const; + virtual Tensor & t_(Tensor & self) const; + virtual Tensor type_as(const Tensor & self, const Tensor & other) const; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const; + virtual Tensor 
& unsqueeze_(Tensor & self, int64_t dim) const; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const; + virtual Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other) const; + virtual Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output) const; + virtual Tensor poisson(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) const; + virtual std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state) const; + virtual std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array output_mask) const; +protected: + Context* context; +}; + + +} diff --git a/aten/src/ATen/.gitignore b/aten/src/ATen/.gitignore new file mode 100644 index 0000000..12bfd61 --- /dev/null +++ b/aten/src/ATen/.gitignore @@ -0,0 +1 @@ +Config.h diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h new file mode 100644 index 0000000..a7084d4 --- /dev/null +++ b/aten/src/ATen/ATen.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include "ATen/CPUGeneral.h" +#include "ATen/Allocator.h" +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Generator.h" +#include "ATen/Context.h" +#include "ATen/Storage.h" +#include "ATen/Tensor.h" +#include "ATen/Device.h" +#include "ATen/TensorGeometry.h" +#include "ATen/Functions.h" +#include "ATen/Formatting.h" +#include "ATen/TensorOperators.h" +#include "ATen/TensorMethods.h" +#include "ATen/Dispatch.h" +#include "ATen/DimVector.h" +#include "ATen/DeviceGuard.h" +#include "ATen/TensorOptions.h" +#include "ATen/Layout.h" +#include "ATen/OptionsGuard.h" diff --git a/aten/src/ATen/ATenConfig.cmake.in b/aten/src/ATen/ATenConfig.cmake.in new file mode 100644 index 0000000..e945926 --- /dev/null +++ b/aten/src/ATen/ATenConfig.cmake.in @@ -0,0 +1,9 @@ +# Find the TH includes and library +# +# ATEN_INCLUDE_DIR -- where to find the includes +# ATEN_LIBRARIES -- list of libraries to link against +# ATEN_FOUND -- set to 1 if found + +SET(ATEN_FOUND 1) +SET(ATEN_INCLUDE_DIR "@ATEN_INCLUDE_DIR@") +SET(ATEN_LIBRARIES "@ATEN_LIBRARIES@") diff --git a/aten/src/ATen/ATenGeneral.h b/aten/src/ATen/ATenGeneral.h new file mode 100644 index 0000000..88c58a0 --- /dev/null +++ b/aten/src/ATen/ATenGeneral.h @@ -0,0 +1,11 @@ +#pragma once + +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define AT_API __declspec(dllexport) +# else +# define AT_API __declspec(dllimport) +# endif +#else +# define AT_API +#endif diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h new file mode 100644 index 
0000000..b11b5ba
--- /dev/null
+++ b/aten/src/ATen/AccumulateType.h
@@ -0,0 +1,43 @@
+#pragma once
+#include "ATen/Config.h"
+#include "ATen/Half.h"
+
+// Defines the accumulation type for a scalar type.
+// Example:
+//   using accscalar_t = acc_type<scalar_t, true>;
+
+#ifdef __CUDACC__
+#include <cuda.h>
+#include <cuda_fp16.h>
+#endif
+
+namespace at {
+
+template <typename T, bool is_cuda>
+struct AccumulateType { };
+
+#ifdef __CUDACC__
+template <> struct AccumulateType<half, true> { using type = float; };
+#endif
+template <> struct AccumulateType<Half, true> { using type = float; };
+template <> struct AccumulateType<float, true> { using type = float; };
+template <> struct AccumulateType<double, true> { using type = double; };
+template <> struct AccumulateType<int8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<char, true> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, true> { using type = int64_t; };
+template <> struct AccumulateType<float, false> { using type = double; };
+template <> struct AccumulateType<double, false> { using type = double; };
+template <> struct AccumulateType<int8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<char, false> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, false> { using type = int64_t; };
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type;
+
+} // namespace at
diff --git a/aten/src/ATen/AlignOf.h b/aten/src/ATen/AlignOf.h
new file mode 100644
index 0000000..5e9f012
--- /dev/null
+++ b/aten/src/ATen/AlignOf.h
@@ -0,0 +1,145 @@
+//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AlignedCharArray and AlignedCharArrayUnion classes.
+//
+//===----------------------------------------------------------------------===//
+
+// ATen: modified from llvm::AlignOf
+// replaced LLVM_ALIGNAS with alignas
+
+#pragma once
+
+#include <cstddef>
+
+namespace at {
+
+/// \struct AlignedCharArray
+/// \brief Helper for building an aligned character array type.
+///
+/// This template is used to explicitly build up a collection of aligned
+/// character array types. We have to build these up using a macro and explicit
+/// specialization to cope with MSVC (at least till 2015) where only an
+/// integer literal can be used to specify an alignment constraint. Once built
+/// up here, we can then begin to indirect between these using normal C++
+/// template parameters.
+
+// MSVC requires special handling here.
+#ifndef _MSC_VER
+
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray {
+  alignas(Alignment) char buffer[Size];
+};
+
+#else // _MSC_VER
+
+/// \brief Create a type with an aligned char buffer.
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray;
+
+// We provide special variations of this template for the most common
+// alignments because __declspec(align(...)) doesn't actually work when it is
+// a member of a by-value function argument in MSVC, even if the alignment
+// request is something reasonably like 8-byte or 16-byte.
Note that we can't +// even include the declspec with the union that forces the alignment because +// MSVC warns on the existence of the declspec despite the union member forcing +// proper alignment. + +template +struct AlignedCharArray<1, Size> { + union { + char aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<2, Size> { + union { + short aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<4, Size> { + union { + int aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<8, Size> { + union { + double aligned; + char buffer[Size]; + }; +}; + + +// The rest of these are provided with a __declspec(align(...)) and we simply +// can't pass them by-value as function arguments on MSVC. + +#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ + }; + +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) + +#undef AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT + +#endif // _MSC_VER + +namespace detail { +template +class AlignerImpl { + T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; + + AlignerImpl() = delete; +}; + +template +union SizerImpl { + char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; +}; +} // end namespace detail + +/// \brief This union template exposes a suitably aligned and sized character +/// array member which can hold elements of any of up to ten types. +/// +/// These types may be arrays, structs, or any other types. The goal is to +/// expose a char array buffer member which can be used as suitable storage for +/// a placement new of any of these types. Support for more than ten types can +/// be added at the cost of more boilerplate. +template +struct AlignedCharArrayUnion : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail::SizerImpl)> { +}; +} // end namespace at diff --git a/aten/src/ATen/Allocator.cpp b/aten/src/ATen/Allocator.cpp new file mode 100644 index 0000000..7d2f1fa --- /dev/null +++ b/aten/src/ATen/Allocator.cpp @@ -0,0 +1,14 @@ +#include + +namespace at { + +static void deleteInefficientStdFunctionContext(void* ptr) { + delete static_cast(ptr); +} + +at::DataPtr +InefficientStdFunctionContext::makeDataPtr(void* ptr, const std::function& deleter, Device device) { + return {ptr, new InefficientStdFunctionContext({ptr, deleter}), &deleteInefficientStdFunctionContext, device}; +} + +} // namespace at diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h new file mode 100644 index 0000000..867ae4c --- /dev/null +++ b/aten/src/ATen/Allocator.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace at { + +// A DataPtr is a unique pointer (with an attached deleter and some +// context for the deleter) to some memory, which also records what +// device is for its data. +// +// nullptr DataPtrs can still have a nontrivial device; this allows +// us to treat zero-size allocations uniformly with non-zero allocations. 
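Taken together, the comment above and the declarations that follow describe the ownership model: a DataPtr couples the raw pointer with a deleter context and the device the memory lives on, and an Allocator (declared further down in this header) is the factory that hands them out. A minimal sketch of a conforming CPU allocator follows; the class name and the malloc/free strategy are illustrative assumptions rather than part of this diff, and the include path is assumed.

#include <cstddef>
#include <cstdlib>
#include <ATen/Allocator.h>  // assumed include path for the header in this diff

namespace {

// Deleter: the context passed in is the data pointer itself.
void deleteWithFree(void* ctx) {
  std::free(ctx);
}

// Sketch of an Allocator whose deleter context equals the data pointer, which
// is exactly the situation the raw_deleter()/raw_allocate()/raw_deallocate()
// note further down is about.
struct SketchCPUAllocator final : at::Allocator {
  at::DataPtr allocate(size_t n) const override {
    void* data = n ? std::malloc(n) : nullptr;
    return {data, data, &deleteWithFree, at::kCPU};
  }
  at::DeleterFnPtr raw_deleter() const override {
    return &deleteWithFree;
  }
};

} // anonymous namespace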
+// +class DataPtr { +private: + detail::UniqueVoidPtr ptr_; + Device device_; +public: + // Choice of CPU here is arbitrary; if there's an "undefined" device + // we could use that too + DataPtr() : ptr_(), device_(kCPU) {} + DataPtr(void* data, Device device) + : ptr_(data), device_(device) {} + DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) + : ptr_(data, ctx, ctx_deleter), device_(device) {} + void* operator->() const { return ptr_.get(); } + void* get() const { return ptr_.get(); } + void* get_context() const { return ptr_.get_context(); } + void* release_context() { return ptr_.release_context(); } + operator bool() const { return static_cast(ptr_); } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + return ptr_.cast_context(expected_deleter); + } + Device device() const { return device_; } +}; + +// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a +// CPU nullptr + +inline bool operator==(const at::DataPtr& dp, std::nullptr_t) noexcept { return !dp; } +inline bool operator==(std::nullptr_t, const at::DataPtr& dp) noexcept { return !dp; } +inline bool operator!=(const at::DataPtr& dp, std::nullptr_t) noexcept { return dp; } +inline bool operator!=(std::nullptr_t, const at::DataPtr& dp) noexcept { return dp; } + +// Note [raw_allocate/raw_deallocate and Thrust] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Thrust's support for custom allocators requires us to write something +// like this: +// +// class ThrustAllocator { +// char* allocate(size_t); +// void deallocate(char*, size_t); +// }; +// +// This is not good for our unique_ptr based allocator interface, as +// there is no way to get to the context when we free. +// +// However, in some cases the context is exactly the same as +// the data pointer. In this case, we can support the "raw" +// allocate and deallocate interface. This is what +// raw_deleter signifies. By default, it returns a nullptr, which means that +// the raw interface is not implemented. Be sure to implement it whenever +// possible, or the raw interface will incorrectly reported as unsupported, +// when it is actually possible. + +struct Allocator { + virtual ~Allocator() {} + virtual at::DataPtr allocate(size_t n) const = 0; + + // If this returns a non nullptr, it means that allocate() + // is guaranteed to return a unique_ptr with this deleter attached; + // it means the rawAllocate and rawDeallocate APIs are safe to use. + // This function MUST always return the same BoundDeleter. + virtual DeleterFnPtr raw_deleter() const { return nullptr; } + void* raw_allocate(size_t n) { + auto dptr = allocate(n); + AT_ASSERT(dptr.get() == dptr.get_context()); + return dptr.release_context(); + } + void raw_deallocate(void* ptr) { + auto d = raw_deleter(); + AT_ASSERT(d); + d(ptr); + } +}; + +struct AT_API InefficientStdFunctionContext { + std::unique_ptr> ptr_; + InefficientStdFunctionContext(std::unique_ptr>&& ptr) + : ptr_(std::move(ptr)) {} + static at::DataPtr makeDataPtr(void* ptr, const std::function& deleter, Device device); +}; + +} // namespace at diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h new file mode 100644 index 0000000..df14402 --- /dev/null +++ b/aten/src/ATen/ArrayRef.h @@ -0,0 +1,192 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include + +#include +#include +#include + +namespace at { + /// ArrayRef - Represent a constant reference to an array (0 or more elements + /// consecutively in memory), i.e. a start pointer and a length. It allows + /// various APIs to take consecutive elements easily and conveniently. + /// + /// This class does not own the underlying data, it is expected to be used in + /// situations where the data resides in some other buffer, whose lifetime + /// extends past that of the ArrayRef. For this reason, it is not in general + /// safe to store an ArrayRef. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + template + class ArrayRef { + public: + typedef const T *iterator; + typedef const T *const_iterator; + typedef size_t size_type; + + typedef std::reverse_iterator reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T *Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + /*implicit*/ ArrayRef(const T &OneElt) + : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + /*implicit*/ ArrayRef(const T *data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + ArrayRef(const T *begin, const T *end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + : Data(Vec.data()), Length(Vec.size()) { + } + + /// Construct an ArrayRef from a std::vector. + template + /*implicit*/ ArrayRef(const std::vector &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /*implicit*/ constexpr ArrayRef(const std::array &Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /*implicit*/ ArrayRef(const std::initializer_list &Vec) + : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + const_iterator begin() const { return Data; } + const_iterator end() const { return Data + Length; } + + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + + /// empty - Check if the array is empty. + bool empty() const { return Length == 0; } + + const T *data() const { return Data; } + + /// size - Get the array size. + size_t size() const { return Length; } + + /// front - Get the first element. + const T &front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. 
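+    ///
+    /// Example (illustrative):
+    ///   at::ArrayRef<int>({1, 2, 3}).back();   // returns 3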
+ const T &back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length-1]; + } + + /// equals - Check for element-wise equality. + bool equals(ArrayRef RHS) const { + if (Length != RHS.Length) + return false; + return std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + ArrayRef slice(size_t N, size_t M) const { + AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); + return ArrayRef(data()+N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + ArrayRef slice(size_t N) const { return slice(N, size() - N); } + + /// @} + /// @name Operator Overloads + /// @{ + const T &operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + const T &at(size_t Index) const { + AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data+Length); + } + + /// @} + /// @name Conversion operators + /// @{ + operator std::vector() const { + return std::vector(Data, Data+Length); + } + + /// @} + }; + +} // end namespace at diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp new file mode 100644 index 0000000..8ee61c7 --- /dev/null +++ b/aten/src/ATen/Backtrace.cpp @@ -0,0 +1,230 @@ +#include +#include + +#include +#include +#include +#include +#include + +#if !defined(_WIN32) +#include +#include +#endif // !defined(_WIN32) + +namespace at { +#if defined(_MSC_VER) +// Windows does not have cxxabi.h, so we will simply return the original. +std::string demangle(const char* name) { + return std::string(name); +} +#else +std::string demangle(const char* name) { + int status = -1; + + // This function will demangle the mangled function name into a more human + // readable format, e.g. _Z1gv -> g(). + // More information: + // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/libsupc%2B%2B/cxxabi.h + // NOTE: `__cxa_demangle` returns a malloc'd string that we have to free + // ourselves. + std::unique_ptr> demangled( + abi::__cxa_demangle( + name, + /*__output_buffer=*/nullptr, + /*__length=*/0, + &status), + /*deleter=*/free); + + // Demangling may fail, for example when the name does not follow the + // standard C++ (Itanium ABI) mangling scheme. This is the case for `main` + // or `clone` for example, so the mangled name is a fine default. + if (status == 0) { + return demangled.get(); + } else { + return name; + } +} +#endif + +// TODO: This backtrace retrieval can be implemented on Windows via the Windows +// API using `CaptureStackBackTrace` and `SymFromAddr`. 
+// https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code +// https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows +// https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. +#if !defined(_WIN32) + +namespace { + +struct FrameInformation { + /// If available, the demangled name of the function at this frame, else + /// whatever (possibly mangled) name we got from `backtrace()`. + std::string function_name; + /// This is a number in hexadecimal form (e.g. "0xdead") representing the + /// offset into the function's machine code at which the function's body + /// starts, i.e. skipping the "prologue" that handles stack manipulation and + /// other calling convention things. + std::string offset_into_function; + /// NOTE: In debugger parlance, the "object file" refers to the ELF file that + /// the symbol originates from, i.e. either an executable or a library. + std::string object_file; +}; + +bool is_python_frame(const FrameInformation& frame) { + return frame.object_file == "python" || + (frame.object_file.find("libpython") != std::string::npos); +} + +at::optional parse_frame_information( + const std::string& frame_string) { + FrameInformation frame; + + // This is the function name in the CXX ABI mangled format, e.g. something + // like _Z1gv. Reference: + // https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling + std::string mangled_function_name; + +#if defined(__GLIBCXX__) + // In GLIBCXX, `frame_string` follows the pattern + // `(+) + // []` + + auto function_name_start = frame_string.find("("); + if (function_name_start == std::string::npos) { + return at::nullopt; + } + function_name_start += 1; + + auto offset_start = frame_string.find('+', function_name_start); + if (offset_start == std::string::npos) { + return at::nullopt; + } + offset_start += 1; + + const auto offset_end = frame_string.find(')', offset_start); + if (offset_end == std::string::npos) { + return at::nullopt; + } + + frame.object_file = frame_string.substr(0, function_name_start - 1); + frame.offset_into_function = + frame_string.substr(offset_start, offset_end - offset_start); + + // NOTE: We don't need to parse the return address because + // we already have it from the call to `backtrace()`. + + mangled_function_name = frame_string.substr( + function_name_start, (offset_start - 1) - function_name_start); +#elif defined(_LIBCPP_VERSION) + // In LIBCXX, The pattern is + // ` + + // ` + std::string skip; + std::istringstream input_stream(frame_string); + // operator>>() does not fail -- if the input stream is corrupted, the + // strings will simply be empty. + input_stream >> skip >> frame.object_file >> skip >> mangled_function_name >> + skip >> frame.offset_into_function; +#else +#warning Unknown standard library, backtraces may have incomplete debug information + return at::nullopt; +#endif // defined(__GLIBCXX__) + + // Some system-level functions don't have sufficient debug information, so + // we'll display them as "". They'll still have a return + // address and other pieces of information. 
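+  //
+  // For reference (values below are illustrative only), a GLIBCXX frame
+  // string has the shape
+  //
+  //   /usr/lib/libcaffe2.so(_ZN2at13get_backtraceEmmb+0x2a) [0x7f00deadbeef]
+  //
+  // i.e. <object-file>(<mangled-function-name>+<offset-into-function>)
+  // [<return-address>], and the mangled name may be empty for such frames.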
+ if (mangled_function_name.empty()) { + frame.function_name = ""; + return frame; + } + + frame.function_name = demangle(mangled_function_name.c_str()); + return frame; +} + +} // anonymous namespace + +#endif // !defined(_WIN32) + +std::string get_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { +#if !defined(_WIN32) + + // We always skip this frame (backtrace). + frames_to_skip += 1; + + std::vector callstack( + frames_to_skip + maximum_number_of_frames, nullptr); + // backtrace() gives us a list of return addresses in the current call stack. + // NOTE: As per man (3) backtrace it can never fail + // (http://man7.org/linux/man-pages/man3/backtrace.3.html). + auto number_of_frames = + ::backtrace(callstack.data(), static_cast(callstack.size())); + + // Skip as many frames as requested. This is not efficient, but the sizes here + // are small and it makes the code nicer and safer. + for (; frames_to_skip > 0 && number_of_frames > 0; + --frames_to_skip, --number_of_frames) { + callstack.erase(callstack.begin()); + } + + // `number_of_frames` is strictly less than the current capacity of + // `callstack`, so this is just a pointer subtraction and makes the subsequent + // code safer. + callstack.resize(static_cast(number_of_frames)); + + // `backtrace_symbols` takes the return addresses obtained from `backtrace()` + // and fetches string representations of each stack. Unfortunately it doesn't + // return a struct of individual pieces of information but a concatenated + // string, so we'll have to parse the string after. NOTE: The array returned + // by `backtrace_symbols` is malloc'd and must be manually freed, but not the + // strings inside the array. + std::unique_ptr> raw_symbols( + ::backtrace_symbols(callstack.data(), static_cast(callstack.size())), + /*deleter=*/free); + const std::vector symbols( + raw_symbols.get(), raw_symbols.get() + callstack.size()); + + // The backtrace string goes into here. + std::ostringstream stream; + + // Toggles to true after the first skipped python frame. + bool has_skipped_python_frames = false; + + for (size_t frame_number = 0; frame_number < callstack.size(); + ++frame_number) { + const auto frame = parse_frame_information(symbols[frame_number]); + + if (skip_python_frames && frame && is_python_frame(*frame)) { + if (!has_skipped_python_frames) { + stream << "\n"; + has_skipped_python_frames = true; + } + continue; + } + + // frame #: + stream << "frame #" << frame_number << ": "; + + if (frame) { + // + ( in ) + stream << frame->function_name << " + " << frame->offset_into_function + << " (" << callstack[frame_number] << " in " << frame->object_file + << ")\n"; + } else { + // In the edge-case where we couldn't parse the frame string, we can + // just use it directly (it may have a different format). + stream << symbols[frame_number] << "\n"; + } + } + + return stream.str(); + +#else + + return "(no backtrace available)"; +#endif +} +} // namespace at diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h new file mode 100644 index 0000000..347c430 --- /dev/null +++ b/aten/src/ATen/Backtrace.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
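+///
+/// Example (illustrative):
+///   const char* name = at::demangle_type<std::vector<int>>();
+///   // With RTTI enabled, name is something like
+///   // "std::vector<int, std::allocator<int> >"; otherwise a placeholder
+///   // string is returned.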
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt new file mode 100644 index 0000000..9e19223 --- /dev/null +++ b/aten/src/ATen/CMakeLists.txt @@ -0,0 +1,456 @@ +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # ---[ Generate and install header and cpp files + include(../../../cmake/Codegen.cmake) +endif() + +IF(NOT MSVC) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-absolute-value") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-absolute-value") +ENDIF(NOT MSVC) + +################################################################################ +# Helper functions +################################################################################ + +function(filter_list output input) + unset(result) + foreach(filename ${${input}}) + foreach(pattern ${ARGN}) + if("${filename}" MATCHES "${pattern}") + list(APPEND result "${filename}") + endif() + endforeach() + endforeach() + set(${output} ${result} PARENT_SCOPE) +endfunction() + + +# Can be compiled standalone +IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) + SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") + SET(AT_INSTALL_LIB_DIR "lib" CACHE PATH "AT install library subdirectory") + SET(AT_INSTALL_INCLUDE_DIR "include" CACHE PATH "AT install include subdirectory") + SET(AT_INSTALL_SHARE_DIR "share" CACHE PATH "AT install include subdirectory") +ENDIF() + +CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") +CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h") + +# NB: If you edit these globs, you'll have to update setup.py package_data as well +FILE(GLOB base_h "*.h" "detail/*.h") +FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") +FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") +FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") +FILE(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") +FILE(GLOB cudnn_cpp "cudnn/*.cpp") +FILE(GLOB mkl_cpp "mkl/*.cpp") +FILE(GLOB mkldnn_cpp "mkldnn/*.cpp") + +FILE(GLOB native_cpp "native/*.cpp") +FILE(GLOB native_sparse_cpp "native/sparse/*.cpp") +FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") +FILE(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +FILE(GLOB native_cuda_cu "native/cuda/*.cu") +FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") +FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") +FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") + +set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +if(AT_MKL_ENABLED) + set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) +endif() +if(AT_MKLDNN_ENABLED) + set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp}) +endif() + +IF(USE_CUDA OR 
USE_ROCM) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) + set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu}) + set(all_cuda_cpp ${native_cudnn_cpp} ${native_sparse_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + IF(CUDNN_FOUND) + SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp}) + ENDIF() +endif() + +filter_list(generated_h generated_cpp "\\.h$") +filter_list(cuda_generated_h cuda_generated_cpp "\\.h$") + +list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) +# so the build can find the generated header files +list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}) +IF(NOT AT_LINK_STYLE) + SET(AT_LINK_STYLE SHARED) +ENDIF() + +IF(BLAS_FOUND) + IF ($ENV{TH_BINARY_BUILD}) + MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") + list(APPEND ATen_CPU_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + if(USE_CUDA OR USE_ROCM) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + endif() + ELSE ($ENV{TH_BINARY_BUILD}) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) + if(USE_CUDA OR USE_ROCM) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${BLAS_LIBRARIES}") + endif() + ENDIF ($ENV{TH_BINARY_BUILD}) +ENDIF(BLAS_FOUND) + +IF(LAPACK_FOUND) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) + if(USE_CUDA OR USE_ROCM) + # Although Lapack provides CPU (and thus, one might expect that ATen_cuda + # would not need this at all), some of our libraries (magma in particular) + # backend to CPU BLAS/LAPACK implementations, and so it is very important + # we get the *right* implementation, because even if the symbols are the + # same, LAPACK implementions may have different calling conventions. + # This caused https://github.com/pytorch/pytorch/issues/7353 + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) + endif() +ENDIF(LAPACK_FOUND) + +IF (UNIX AND NOT APPLE) + INCLUDE(CheckLibraryExists) + # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830 + CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT) + IF(NEED_LIBRT) + list(APPEND ATen_CPU_DEPENDENCY_LIBS rt) + SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt) + ENDIF(NEED_LIBRT) +ENDIF(UNIX AND NOT APPLE) + +IF(UNIX) + SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h") + CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP) + IF(HAVE_MMAP) + ADD_DEFINITIONS(-DHAVE_MMAP=1) + ENDIF(HAVE_MMAP) + # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html + ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64) + CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN) + IF(HAVE_SHM_OPEN) + ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1) + ENDIF(HAVE_SHM_OPEN) + CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK) + IF(HAVE_SHM_UNLINK) + ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1) + ENDIF(HAVE_SHM_UNLINK) + CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE) + IF(HAVE_MALLOC_USABLE_SIZE) + ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1) + ENDIF(HAVE_MALLOC_USABLE_SIZE) +ENDIF(UNIX) + +if(NOT MSVC) + list(APPEND ATen_CPU_DEPENDENCY_LIBS m) +endif() + +if(MKLDNN_FOUND) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${MKLDNN_LIBRARIES}) +endif(MKLDNN_FOUND) + +list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) + +if(NOT MSVC) + # Preserve values for the main build + set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) + set(__aten_sleef_build_tests ${BUILD_TESTS}) + + # Unset our restrictive C++ flags here and reset them later. 
+ # Remove this once we use proper target_compile_options. + set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(CMAKE_CXX_FLAGS) + + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) + set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) + set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) + set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) + add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef" ${CMAKE_BINARY_DIR}/sleef) + set_property(TARGET sleef PROPERTY FOLDER "dependencies") + list(APPEND ATen_THIRD_PARTY_INCLUDE ${CMAKE_BINARY_DIR}/include) + link_directories(${CMAKE_BINARY_DIR}/sleef/lib) + list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) + + set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) + + # Set these back. TODO: Use SLEEF_ to pass these instead + set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) + set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) +endif() + +IF(USE_CUDA AND NOT USE_ROCM) + IF ($ENV{ATEN_STATIC_CUDA}) + # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support + # we first have to build a fake lib that links with no device callbacks, + # and then we link against this object file. + # This was recommended by the CuFFT team at NVIDIA + + # build fake CuFFT lib in build dir + EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc) + if(${CUDA_VERSION_MAJOR} EQUAL "8") + SET(CUFFT_FAKELINK_OPTIONS + --generate-code arch=compute_35,code=sm_35 + --generate-code arch=compute_50,code=sm_50 + --generate-code arch=compute_60,code=sm_60) + elseif(${CUDA_VERSION_MAJOR} EQUAL "9") + SET(CUFFT_FAKELINK_OPTIONS + --generate-code arch=compute_35,code=sm_35 + --generate-code arch=compute_50,code=sm_50 + --generate-code arch=compute_60,code=sm_60 + --generate-code arch=compute_70,code=sm_70) + else() + MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}") + endif() + ADD_CUSTOM_COMMAND( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a + COMMAND "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" -o ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a -Xcompiler -fPIC + ${CUFFT_FAKELINK_OPTIONS} + --device-link ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc -lcufft_static -lculibos + ) + ADD_CUSTOM_TARGET(FAKELINKED_CUFFT_TARGET DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) + add_library(FAKELINKED_CUFFT STATIC IMPORTED GLOBAL) + add_dependencies(FAKELINKED_CUFFT FAKELINKED_CUFFT_TARGET) + set_target_properties(FAKELINKED_CUFFT PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusparse_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a + FAKELINKED_CUFFT + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static.a + ) + ELSE() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + ${CUDA_cusparse_LIBRARY} + ${CUDA_curand_LIBRARY}) + ENDIF() + + if(CUDNN_FOUND) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDNN_LIBRARIES}) + endif(CUDNN_FOUND) + + IF(USE_MAGMA) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES}) + IF ($ENV{TH_BINARY_BUILD}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + ENDIF($ENV{TH_BINARY_BUILD}) + ENDIF(USE_MAGMA) + IF ($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS 
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a") + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a") + ENDIF($ENV{ATEN_STATIC_CUDA}) +ENDIF() + +IF(USE_ROCM) + ### Link in the ROCm libraries BLAS / RNG. + FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib) + FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY}) +ENDIF() + +# Include CPU paths for CUDA as well +list(APPEND ATen_CUDA_INCLUDE ${ATen_CPU_INCLUDE}) + +# We have two libraries: libATen_cpu.so and libATen_cuda.so, +# with libATen_cuda.so depending on libATen_cpu.so. The CPU library +# contains CPU code only. libATen_cpu.so is invariant to the setting +# of USE_CUDA (it always builds the same way); libATen_cuda.so is only +# built when USE_CUDA=1 and CUDA is available. +set(ATen_CPU_SRCS ${all_cpu_cpp}) +if(AT_LINK_STYLE STREQUAL "INTERFACE") + # Source code can't be added to an interface library, so it is + # passed back to be compiled into the containing library + add_library(ATen_cpu INTERFACE) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ATEN_CPU_FILES_GEN_LIB) +else() + add_library(ATen_cpu ${AT_LINK_STYLE} ${ATen_CPU_SRCS}) + if (ATen_THIRD_PARTY_INCLUDE) + target_include_directories(ATen_cpu SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + endif() + target_include_directories(ATen_cpu INTERFACE $) + target_include_directories(ATen_cpu PRIVATE ${ATen_CPU_INCLUDE}) + target_link_libraries(ATen_cpu PUBLIC ${ATen_CPU_DEPENDENCY_LIBS}) + target_link_libraries(ATen_cpu PRIVATE ATEN_CPU_FILES_GEN_LIB) + caffe2_interface_library(ATen_cpu ATen_cpu_library) + # Set standard properties on the target + aten_set_target_props(ATen_cpu) + + # Make sure these don't get built by parent + set(ATen_CPU_SRCS) +endif() + +if(USE_CUDA OR USE_ROCM) + set(ATen_CUDA_SRCS ${all_cuda_cpp}) + if(AT_LINK_STYLE STREQUAL "INTERFACE") + # Source code can't be added to an interface library, so it is + # passed back to be compiled into the containing library + add_library(ATen_cuda INTERFACE) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) + else() + # A hack to deal with cuda library dependencies and modern CMake: the + # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, + # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This + # hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with + # it. We will then manually add the cudart library as interface libs. + set(__tmp ${CUDA_LIBRARIES}) + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + torch_cuda_based_add_library(ATen_cuda ${AT_LINK_STYLE} ${ATen_CUDA_SRCS}) + set(CUDA_LIBRARIES ${__tmp}) + target_link_libraries(ATen_cuda INTERFACE caffe2::cudart) + + target_include_directories( + ATen_cuda INTERFACE $) + target_include_directories( + ATen_cuda PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_include_directories( + ATen_cuda PRIVATE ${ATen_CUDA_INCLUDE}) + target_link_libraries( + ATen_cuda PRIVATE ${ATen_CUDA_DEPENDENCY_LIBS} ATEN_CUDA_FILES_GEN_LIB) + + # These public dependencies must go after the previous dependencies, as the + # order of the libraries in the linker call matters here when statically + # linking; libculibos and cublas must be last. 
+ target_link_libraries( + ATen_cuda PUBLIC ATen_cpu ${ATen_PUBLIC_CUDA_DEPENDENCY_LIBS}) + + # Set standard properties on the target + aten_set_target_props(ATen_cuda) + + caffe2_interface_library(ATen_cuda ATen_cuda_library) + + # Make sure these don't get built by parent + set(ATen_CUDA_SRCS) + endif() +endif() + +if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") + if(USE_CUDA) + if (NOT $ENV{ATEN_STATIC_CUDA}) + cuda_add_cublas_to_target(ATen_cuda) + cuda_add_cufft_to_target(ATen_cuda) + endif() + endif() + + if(NOT MSVC) + aten_compile_options(ATen_cpu) + if(USE_CUDA OR USE_ROCM) + aten_compile_options(ATen_cuda) + endif() + endif() + + if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") + set_property(TARGET ATen_cpu PROPERTY CXX_STANDARD 11) + if(USE_CUDA OR USE_ROCM) + set_property(TARGET ATen_cuda PROPERTY CXX_STANDARD 11) + endif() + endif() +endif() + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # Eventually replace this use of LOCATION with use of + # $, but generators only work in some cases + cmake_policy(SET CMP0026 OLD) + get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) + get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) + set(ATEN_LIBRARIES + "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") + if(USE_CUDA OR USE_ROCM) + get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) + get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) + list(APPEND ATEN_LIBRARIES + "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") + endif() + + install(TARGETS ATen_cpu + RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" + LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") + + if(USE_CUDA OR USE_ROCM) + install(TARGETS ATen_cuda + RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" + LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") + endif() +endif() + +SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") +CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") +INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" + DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") + +# https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake +FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) + GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) + INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) +ENDFOREACH() +FOREACH(HEADER ${generated_h} ${cuda_generated_h}) + # NB: Assumed to be flat + INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen) +ENDFOREACH() +INSTALL(FILES ${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml + DESTINATION ${AT_INSTALL_SHARE_DIR}/ATen) + +# if(ATEN_NO_TEST) +# message("disable test because ATEN_NO_TEST is set") +# else() +# add_subdirectory(test) +# endif() + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + foreach(test_src ${ATen_CPU_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} ATen_cpu) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + 
endforeach() + + if(USE_CUDA OR USE_ROCM) + foreach(test_src ${ATen_CUDA_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + torch_cuda_based_add_executable(${test_name} "${test_src}") + target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + endforeach() + endif() + + # Make sure these don't get built by parent + set(ATen_CPU_TEST_SRCS) + set(ATen_CUDA_TEST_SRCS) +endif() + +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h new file mode 100644 index 0000000..2db2786 --- /dev/null +++ b/aten/src/ATen/CPUApplyUtils.h @@ -0,0 +1,492 @@ +#pragma once + +#include "ATen/Parallel.h" +#include "ATen/TensorUtils.h" +#include + +namespace at { + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where + * the data is no longer contiguous, i.e. the stride at that dimension is not + * equal to the size of the tensor defined by the outer dimensions. Let's call + * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then + * A is equal to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * + * index_B), i.e. basically we compute the offset into the storage as we would + * normally for a Tensor. But because we are guaranteed the subsequent data is + * contiguous in memory, we can simply loop for sizeof(A) iterations and perform + * the operation, without having to follow the order described by the strides of + * A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in + * memory. For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, + * then the first two dimensions can be merged for the purposes of APPLY, + * reducing the number of nested loops. 
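+ *
+ * Working through that example: assuming the usual contiguous layout of the
+ * original 3x3x4x3 tensor, the narrowed tensor has sizes (3, 3, 3, 3) and
+ * strides (36, 12, 3, 1). Since stride(0) == size(1) * stride(1)
+ * (36 == 3 * 12), dimensions 0 and 1 merge into a single dimension of size 9
+ * and stride 12; stride(1) != size(2) * stride(2) (12 != 9), so the merging
+ * stops there, leaving sizes (9, 3, 3).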
+ */ + +inline Tensor sort_strides(Tensor& tensor_) { + IntList strides = tensor_.strides(); + std::vector indices; + indices.reserve(tensor_.ndimension()); + for (int64_t i = 0; i < tensor_.ndimension(); i++) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { + return strides[i1] > strides[i2]; + }); + Tensor tensor = tensor_.permute(indices); + return tensor; +} + +template +inline void _setup_arrays(Tensor& tensor, Arg* iter) { + int64_t max_dim = tensor.ndimension(); + iter->dim_ = 0; + for (int64_t i = 0; i < max_dim; i++) { + int64_t size = tensor.size(i); + int64_t stride = tensor.stride(i); + while (i + 1 < max_dim && + (tensor.size(i + 1) == 1 || + tensor.stride(i) == tensor.size(i + 1) * tensor.stride(i + 1))) { + size = size * tensor.size(i + 1); + if (tensor.size(i + 1) != 1) + stride = tensor.stride(i + 1); + i++; + } + iter->sizes_[iter->dim_] = size; + iter->strides_[iter->dim_] = stride; + iter->dim_++; + } +} + +template +struct strided_tensor_iter_fixed { + public: + T* data_ = NULL; + int64_t dim_ = 0; + + int64_t counter_[N] = {0}; + int64_t sizes_[N] = {0}; + int64_t strides_[N] = {0}; + + strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete; + void operator=(strided_tensor_iter_fixed const& x) = delete; + strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; + strided_tensor_iter_fixed(Tensor& tensor, bool sort_strides = false) + : data_(tensor.data()) { + memset(counter_, 0, sizeof(int64_t) * N); + _setup_arrays(tensor, this); + } +}; + +template +struct strided_tensor_iter { + private: + public: + T* data_ = NULL; + int64_t dim_; + + std::vector counter_; + std::vector sizes_; + std::vector strides_; + + strided_tensor_iter(strided_tensor_iter const&) = delete; + void operator=(strided_tensor_iter const& x) = delete; + strided_tensor_iter(strided_tensor_iter&&) = default; + strided_tensor_iter(Tensor& tensor) + : data_(tensor.data()), + dim_(tensor.ndimension()), + counter_(dim_, 0), + sizes_(tensor.sizes()), + strides_(tensor.strides()) { + _setup_arrays(tensor, this); + } +}; + +inline bool _all_equal_numel(at::ArrayRef tensors) { + if (tensors.size() == 0) + return true; + int64_t all_numel = tensors[0].numel(); + for (size_t i = 1; i < tensors.size(); i++) { + if (tensors[i].numel() != all_numel) + return false; + } + return true; +} + +inline std::string _all_equal_numel_error(at::ArrayRef tensors) { + std::ostringstream oss; + oss << "inconsistent tensor size, expected "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].sizes() << ", "; + } + oss << "and " << tensors[tensors.size() - 1] + << " to have the same number of elements, but got "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].numel() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].numel() + << " elements respectively"; + return oss.str(); +} + +inline bool _apply_preamble(ArrayRef tensors) { + checkBackend("CPU_tensor_apply", tensors, Backend::CPU); + if (!_all_equal_numel(tensors)) + throw std::runtime_error(_all_equal_numel_error(tensors)); + // An empty tensor has no elements + for (auto& t : tensors) + if (t.numel() == 0) + return false; + return true; +} + +inline int64_t _max_dim_tensors(ArrayRef tensors) { + int64_t dim = 0; + for (auto& t : tensors) + dim = std::max(dim, t.ndimension()); + return dim; +} + +inline void iterate(int64_t size){}; + +template +inline void iterate(int64_t size, Arg& iter, Args&... 
iter_tail) { + iter.counter_[iter.dim_ - 1] += size; + iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1]; + iterate(size, iter_tail...); +} + +inline bool iterate_continue() { + return true; +}; + +template +inline bool iterate_continue(Arg& iter, Args&... iter_tail) { + return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] && + iterate_continue(iter_tail...); +} + +inline int64_t max_iterate_size() { + return std::numeric_limits::max(); +}; + +template +inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) { + return std::min( + (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]), + max_iterate_size(iter_tail...)); +} + +inline void iterate_overflow(){}; + +template +inline void iterate_overflow(Arg& iter, Args&... iter_tail) { + if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) { + for (int64_t i = iter.dim_ - 1; i > 0; i--) { + if (iter.counter_[i] == iter.sizes_[i]) { + iter.counter_[i] = 0; + iter.counter_[i - 1]++; + iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) + + iter.strides_[i - 1]; + } + } + } + iterate_overflow(iter_tail...); +} + +inline void forward(int64_t offset){}; + +template +inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) { + int64_t multi = offset; + for (int64_t i = iter.dim_ - 1; i >= 0; i--) { + int64_t inc = multi % iter.sizes_[i]; + multi = multi / iter.sizes_[i]; + iter.data_ = iter.data_ + inc * iter.strides_[i]; + iter.counter_[i] += inc; + } + forward(offset, iter_tail...); +} + +inline int64_t max_dim() { + return 0; +} + +template +inline int64_t max_dim(Arg& iter, Args&... iter_tail) { + return std::max(iter.dim_, max_dim(iter_tail...)); +} + +inline void apply_op(){}; + +template +inline void +apply_op(int64_t numel, int64_t offset, const Op& op, Args... iters) { + // For 0-dim tensors + if (numel == 1 && max_dim(iters...) == 0) { + op(*iters.data_...); + return; + } + if (offset > 0) + forward(offset, iters...); + // Splitting this into chunks helps the compiler create faster assembly + for (int64_t i = 0; i < numel;) { + for (; iterate_continue(iters...) && i < numel;) { + op(*iters.data_...); + iterate(1, iters...); + i++; + } + iterate_overflow(iters...); + } +} + + +inline void apply_kernel(){}; + +// TODO: Deal elegantly with 0-dim tensors. iters.strides_ of 0-dim +// strided_tensor_iter will be of size 0 for dim 0 and iters.strides_[iters.dim_ +// - 1] will index at -1. C++14 integer_sequence could be of use here. +template +inline void +apply_kernel(int64_t numel, int64_t offset, const Op& op, Args... 
iters) { + if (offset > 0) + forward(offset, iters...); + int64_t size = std::min(numel, max_iterate_size(iters...)); + op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); + iterate(size, iters...); + iterate_overflow(iters...); + int64_t i = size; + size = std::min(numel, max_iterate_size(iters...)); + for (; i < numel;) { + op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); + iterate(size, iters...); + i += size; + iterate_overflow(iters...); + } +} + +template +inline void +CPU_tensor_parallel_kernel_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (tensor1.numel() == 1) { + op(1, tensor1.data(), tensor2.data(), 0, 0); + return; + } + if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + 1, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_kernel( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + 1, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_kernel( + end - begin, + begin, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + }); + } +} + +/* + Apply a pointwise operator to sequence of tensors + + The calling convention for op is a function/functor that takes takes the same + number of pointers of type scalar as the number of given tensors. For example, + to compute a = b * c, op would be of the form: + [](scalar* a_val, const scalar* b_val, const scalar* c_val) { a_val[0] = + b_val[0] * c_val[0]; }; +*/ + +template +inline void CPU_tensor_apply1(Tensor tensor1, const Op op) { + if (!_apply_preamble({tensor1})) + return; + if (tensor1.ndimension() < 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1, true)); + } else { + apply_op(tensor1.numel(), 0, op, strided_tensor_iter(tensor1)); + } +} + +template +inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (_max_dim_tensors({tensor1, tensor2}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + } +} + +template +inline void +CPU_tensor_apply3(Tensor tensor1, Tensor tensor2, Tensor tensor3, const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3)); + } +} + +template < + typename scalar1, + typename scalar2, + typename scalar3, + typename scalar4, + typename Op> +inline void CPU_tensor_apply4( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + Tensor tensor4, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3), + strided_tensor_iter_fixed(tensor4)); + } else { + apply_op( + tensor1.numel(), + 0, 
+ op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3), + strided_tensor_iter(tensor4)); + } +} + +template +inline void CPU_tensor_parallel_apply1( + Tensor tensor1, + const Op op, + int64_t grain_size = internal::GRAIN_SIZE) { + if (!_apply_preamble({tensor1})) + return; + if (tensor1.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1, true)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, begin, op, strided_tensor_iter(tensor1)); + }); + } +} + +template +inline void CPU_tensor_parallel_apply2( + Tensor tensor1, + Tensor tensor2, + const Op op, + int64_t grain_size = internal::GRAIN_SIZE) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + }); + } +} + +} // namespace at diff --git a/aten/src/ATen/CPUFixedAllocator.h b/aten/src/ATen/CPUFixedAllocator.h new file mode 100644 index 0000000..c7caea5 --- /dev/null +++ b/aten/src/ATen/CPUFixedAllocator.h @@ -0,0 +1,31 @@ +#pragma once + +#include "TH/TH.h" +#include "ATen/Error.h" + +// This file creates a fake allocator that just throws exceptions if +// it is actually used. 
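+//
+// For illustration (names are assumptions): the state handed to this
+// allocator is a heap-allocated std::function<void(void*)>, e.g.
+//
+//   auto* on_release = new std::function<void(void*)>(
+//       [](void* data) { /* notify the owner of the external blob */ });
+//
+// cpu_fixed_free below invokes it with the allocation and then deletes it.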
+ +// state passed to the allocator is the std::function called +// when the blob is release by ATen + +namespace at { + +static cpu_fixed_malloc(void *, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_realloc(void *, void*, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_free(void * state, void * allocation) { + auto on_release = static_cast*>(state); + (*on_release)(allocation); + delete on_release; +} + +static THAllocator CPU_fixed_allocator = + { cpu_fixed_malloc, cpu_fixed_realloc, cpu_fixed_free }; + +} diff --git a/aten/src/ATen/CPUGeneral.cpp b/aten/src/ATen/CPUGeneral.cpp new file mode 100644 index 0000000..910e3ae --- /dev/null +++ b/aten/src/ATen/CPUGeneral.cpp @@ -0,0 +1,16 @@ +#include +#include +#include +#include + +namespace at { +// Lock free atomic type +std::atomic num_threads(-1); + +void set_num_threads(int num_threads_) { + if (num_threads_ >= 0) + num_threads.store(num_threads_); +} + +int get_num_threads() { return num_threads.load(); } +} diff --git a/aten/src/ATen/CPUGeneral.h b/aten/src/ATen/CPUGeneral.h new file mode 100644 index 0000000..83ee165 --- /dev/null +++ b/aten/src/ATen/CPUGeneral.h @@ -0,0 +1,12 @@ +#pragma once + +// Using AT_API is crucial as otherwise you'll see +// linking errors using MSVC +// See https://msdn.microsoft.com/en-us/library/a90k134d.aspx +// This header adds this if using AT_API +#include "ATen/ATenGeneral.h" + +namespace at { +AT_API void set_num_threads(int); +AT_API int get_num_threads(); +} diff --git a/aten/src/ATen/CPUGenerator.cpp b/aten/src/ATen/CPUGenerator.cpp new file mode 100644 index 0000000..d737e1f --- /dev/null +++ b/aten/src/ATen/CPUGenerator.cpp @@ -0,0 +1,49 @@ +#include "ATen/CPUGenerator.h" + +#define const_generator_cast(generator) \ + dynamic_cast(generator) + +namespace at { + +CPUGenerator::CPUGenerator(Context * context_) + : context(context_), generator(THGenerator_new()) +{} + +CPUGenerator::~CPUGenerator() { + if (generator) + THGenerator_free(generator); +} + +CPUGenerator& CPUGenerator::copy(const Generator& from) { + THGenerator_copy(generator, const_generator_cast(from).generator); + return *this; +} + +CPUGenerator& CPUGenerator::free() { + THGenerator_free(generator); + return *this; +} + +uint64_t CPUGenerator::seed() { + return THRandom_seed(generator); +} + +uint64_t CPUGenerator::initialSeed() { + return THRandom_initialSeed(generator); +} + +CPUGenerator& CPUGenerator::manualSeed(uint64_t seed) { + THRandom_manualSeed(generator, seed); + return *this; +} + +CPUGenerator& CPUGenerator::manualSeedAll(uint64_t seed) { + // There's only one CPU generator + return manualSeed(seed); +} + +void * CPUGenerator::unsafeGetTH() { + return generator; +} + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp new file mode 100644 index 0000000..ad9d51c --- /dev/null +++ b/aten/src/ATen/CUDAStream.cpp @@ -0,0 +1,183 @@ +#include "ATen/CUDAStream.h" +#include "ATen/Error.h" +#include "ATen/detail/CUDAHooksInterface.h" + +#include + +// Internal implementation is entirely hidden +struct CUDAStreamInternals { + bool is_destructible; + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaStream_t stream; +}; + +namespace at { + +namespace detail { + + /* + * Stream state + */ + static constexpr cudaStream_t DEFAULT_STREAM = 0; + + static std::once_flag init_flag; + static int64_t num_gpus; + static 
CUDAStreamInternals* default_streams; + static thread_local CUDAStreamInternals** current_streams = nullptr; + + // Creates a(n indestructible) default stream for each device + // Note: the default stream on each device is signified by a zero + // value for the pointer, and so is not actually created as usual. + // In particular, we don't need to switch devices when creating the + // streams. + static void initDefaultCUDAStreams() { + num_gpus = getCUDAHooks().getNumGPUs(); + default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + default_streams[i].is_destructible = false; + default_streams[i].refcount = 0; + default_streams[i].device = i; + default_streams[i].stream = DEFAULT_STREAM; + } + } + + // Init front-end to ensure initialization only occurs once + static void initCUDAStreamsOnce() { + // Inits default streams (once, globally) + std::call_once(init_flag, initDefaultCUDAStreams); + + // Inits current streams (thread local) to default streams + if (current_streams) return; + current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + current_streams[i] = &default_streams[i]; + } + } + + /* + * Pointer-based stream API + */ + + // Helper to return the current device + static inline int64_t current_device() { + int cur_device; + DynamicCUDAInterface::get_device(&cur_device); + return cur_device; + } + + // Helper to verify the GPU index is valid + static inline void check_gpu(int64_t device) { + AT_CHECK(device >= 0 && device < num_gpus); + } + + CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return &default_streams[device]; + } + CUDAStreamInternals* CUDAStream_getDefaultStream() { + return CUDAStream_getDefaultStreamOnDevice(current_device()); + } + + // Creates (and retains) and new cuda stream + CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { + CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); + internals->is_destructible = true; + internals->refcount = 1; + internals->device = current_device(); + DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority); + return internals; + } + + // Note: despite not being "unsafe," is using these methods in a multithreaded + // environment then the caller must be sure that streams are valid + // when they're requested. These methods will throw an error if an + // invalid stream is requested. + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + auto cur = current_streams[device]; + AT_CHECK(CUDAStream_retain(cur)); + return cur; + } + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { + return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); + } + + // Note: these unsafe methods do not retain the stream before returning it. + // This is unsafe behavior and these methods SHOULD NOT BE USED. + // They are here only for legacy compatibility. 
+ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return current_streams[device]; + } + CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { + return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); + } + + void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + check_gpu(device); + AT_CHECK(ptr); + AT_CHECK(ptr->device == device); + AT_CHECK(CUDAStream_retain(ptr)); + + CUDAStream_free(current_streams[device]); + current_streams[device] = ptr; + } + void CUDAStream_setStream(CUDAStreamInternals* ptr) { + CUDAStream_setStreamOnDevice(current_device(), ptr); + } + + // Getters + cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + return ptr->stream; + } + + int64_t CUDAStream_device(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + return ptr->device; + } + + // Memory management + // Note: only destructible (non-default) streams are ref counted + bool CUDAStream_retain(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + if (ptr->is_destructible) return(++ptr->refcount > 1); + return true; + } + + void CUDAStream_free(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + AT_CHECK(ptr->refcount == 0); + DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } + +} // namespace detail + + /* + * CUDAStream functions + */ + + // Copy constructor + CUDAStream::CUDAStream(const CUDAStream& other) { + AT_CHECK(other.internals_); + AT_CHECK(detail::CUDAStream_retain(other.internals_)); + + internals_ = other.internals_; + } + + // Move constructor + CUDAStream::CUDAStream(CUDAStream&& other) { + AT_CHECK(other.internals_); + + std::swap(internals_, other.internals_); + } + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h new file mode 100644 index 0000000..6e1a663 --- /dev/null +++ b/aten/src/ATen/CUDAStream.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include + +#include + +/* +* A CUDA stream interface with no CUDA build dependency. +* +* Includes the CUDAStream RAII class and a pointer-based stream API. +* +* The ATen Context interface should be preferred when working with streams. +*/ + +// Forward-declares cudaStream_t to avoid depending on CUDA in CPU builds +// Note: this is the internal CUDA runtime typedef for cudaStream_t +struct CUstream_st; +typedef struct CUstream_st* cudaStream_t; + +// Forward-declares internals +struct CUDAStreamInternals; + +namespace at { + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +AT_API CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStream(); + +AT_API CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); + +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); + +// Note: these Unsafe gets should NEVER be used and are only here for legacy +// purposes. Once those uses are gone they should be removed. 
+AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); + +AT_API void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +AT_API void CUDAStream_setStream(CUDAStreamInternals* internals); + +AT_API cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +AT_API int64_t CUDAStream_device(CUDAStreamInternals*); + +AT_API bool CUDAStream_retain(CUDAStreamInternals*); +AT_API void CUDAStream_free(CUDAStreamInternals*&); + +} // namespace detail + +// RAII for a CUDA stream +// Allows use as a cudaStream_t, copying, moving, and metadata access. +struct CUDAStream { + // Constants + static constexpr int32_t DEFAULT_FLAGS = 1; // = cudaStreamNonBlocking; + static constexpr int32_t DEFAULT_PRIORITY = 0; + + // Constructors + CUDAStream() = default; + CUDAStream(CUDAStreamInternals* internals) : internals_{internals} { } + + // Destructor + ~CUDAStream() { detail::CUDAStream_free(internals_); } + + // Copy constructor + AT_API CUDAStream(const CUDAStream& other); + + // Move constructor + AT_API CUDAStream(CUDAStream&& other); + + // Assignment operator + CUDAStream& operator=(CUDAStream other) { + std::swap(internals_, other.internals_); + return *this; + } + + // Implicit conversion to cudaStream_t + operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAStream& left, const CUDAStream& right) { + return left.internals_ < right.internals_; + } + + // Getters + int64_t device() const { return detail::CUDAStream_device(internals_); } + cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } + CUDAStreamInternals* internals() const { return internals_; } + +private: + CUDAStreamInternals* internals_ = nullptr; +}; + +} // namespace at diff --git a/aten/src/ATen/CheckGenerator.h b/aten/src/ATen/CheckGenerator.h new file mode 100644 index 0000000..3cf5c0f --- /dev/null +++ b/aten/src/ATen/CheckGenerator.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/Error.h" +#include "ATen/Generator.h" +#include "ATen/Utils.h" + +namespace at { + +template +static inline T * check_generator(Generator * expr, Generator * defaultValue) { + if (!expr) + expr = defaultValue; + if(auto result = dynamic_cast(expr)) + return result; + AT_ERROR("Expected a '", typeid(T).name(), "' but found '", typeid(expr).name(), "'"); +} + +} // namespace at diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in new file mode 100644 index 0000000..8373c92 --- /dev/null +++ b/aten/src/ATen/Config.h.in @@ -0,0 +1,10 @@ +#pragma once + +// Test these using #if AT_MKL_ENABLED(), not #ifdef, so that it's +// obvious if you forgot to include Config.h +// c.f. 
https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined +// +// DO NOT put the macros for CUDA libraries in this file; they belong in cuda/CUDAConfig.h + +#define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@ +#define AT_MKL_ENABLED() @AT_MKL_ENABLED@ diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp new file mode 100644 index 0000000..59f6ff7 --- /dev/null +++ b/aten/src/ATen/Context.cpp @@ -0,0 +1,95 @@ +#include "ATen/Config.h" + +#include "Context.h" + +#include +#include +#include +#include +#include + +#include "ATen/CPUGenerator.h" + +#ifdef USE_SSE3 +#include +#endif + +namespace at { + +static inline void errorHandler(const char * msg, void * data) { + throw std::runtime_error(msg); +} +static inline void argErrorHandler(int arg, const char * msg, void * data) { + std::stringstream new_error; + new_error << "invalid argument " << arg << ": " << msg; + throw std::runtime_error(new_error.str()); +} + +Context::Context() +: next_id(static_cast(TypeID::NumOptions)) +, thc_state(nullptr, [](THCState* p){ /* no-op */ } ) { + + THSetDefaultErrorHandler(errorHandler,nullptr); + THSetDefaultArgErrorHandler(argErrorHandler,nullptr); + + generator_registry[static_cast(Backend::CPU)] + .reset(new CPUGenerator(this)); + Type::registerCPU(this); +} + +Context & globalContext() { + static Context globalContext_; + return globalContext_; +} + +// NB: This method is *purely* whether or not a user requested +// that CuDNN was enabled, it doesn't actually say anything about +// whether or not CuDNN is actually usable. +bool Context::userEnabledCuDNN() const { + return enabled_cudnn; +} + +void Context::setUserEnabledCuDNN(bool e) { + enabled_cudnn = e; +} + +bool Context::deterministicCuDNN() const { + return deterministic_cudnn; +} + +void Context::setDeterministicCuDNN(bool b) { + deterministic_cudnn = b; +} + +bool Context::benchmarkCuDNN() const { + return benchmark_cudnn; +} + +void Context::setBenchmarkCuDNN(bool b) { + benchmark_cudnn = b; +} + +bool Context::hasMKL() const { +#if AT_MKL_ENABLED() + return true; +#else + return false; +#endif +} + +bool Context::setFlushDenormal(bool on) { +#ifdef USE_SSE3 + // Setting flush-to-zero (FTZ) flag + _MM_SET_FLUSH_ZERO_MODE(on ? _MM_FLUSH_ZERO_ON + : _MM_FLUSH_ZERO_OFF); + + // Setting denormals-are-zero (DAZ) flag + _MM_SET_DENORMALS_ZERO_MODE(on ? _MM_DENORMALS_ZERO_ON + : _MM_DENORMALS_ZERO_OFF); + return true; +#else + return false; +#endif +} + +} diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h new file mode 100644 index 0000000..accb57b --- /dev/null +++ b/aten/src/ATen/Context.h @@ -0,0 +1,208 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include "ATen/Generator.h" +#include "ATen/Type.h" +#include "ATen/Utils.h" +#include "ATen/Error.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/CUDAStream.h" + +#include +#include +#include + +namespace at { + +enum class IsVariable { + NotVariable, + Variable, + NumOptions +}; + +class AT_API Context { +public: + Context(); + Type* getTypeRaw(Backend p, ScalarType s) { + return type_registry[static_cast(p)][static_cast(s)].get(); + } + Type * getTypeOpt(Backend p, ScalarType s) { + initCUDAIfNeeded(p); + auto type = getTypeRaw(p, s); + + if(!type) { + // there is only a single Undefined Type. 
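A minimal sketch (illustrative only) of the rationale behind the function-style macros in Config.h.in above, following the same pattern Context.cpp's hasMKL() uses: testing them with #if fails loudly if ATen/Config.h was forgotten, because the undefined name is replaced by 0 and `0 ()` does not parse, whereas a plain #ifdef would silently take the disabled branch.

```
// Sketch only: why build flags are tested with #if AT_MKL_ENABLED(), not #ifdef.
#include <ATen/Config.h>

bool mkl_enabled_at_build_time() {
#if AT_MKL_ENABLED()
  return true;
#else
  return false;
#endif
}
```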
+ if (p == Backend::Undefined || s == ScalarType::Undefined) { + return getTypeRaw(Backend::Undefined, ScalarType::Undefined); + } + } + + return type; + } + Type & getType(Backend p, ScalarType s) { + auto* type = getTypeOpt(p, s); + if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); + return *type; + } + Generator & defaultGenerator(Backend p) { + initCUDAIfNeeded(p); + auto & generator = generator_registry[static_cast(p)]; + if(!generator) + AT_ERROR(toString(p), " backend type not enabled."); + return *generator; + } + bool hasMKL() const; + bool hasCUDA() const { + return detail::getCUDAHooks().hasCUDA(); + } + bool hasCuDNN() const { + return detail::getCUDAHooks().hasCuDNN(); + } + int64_t current_device() const { + return detail::getCUDAHooks().current_device(); + } + // defined in header so that getType has ability to inline + // call_once check. getType is called fairly frequently + THCState* lazyInitCUDA() { + std::call_once(thc_init,[&] { + thc_state = detail::getCUDAHooks().initCUDA(); + generator_registry[static_cast(Backend::CUDA)] = + detail::getCUDAHooks().initCUDAGenerator(this); + detail::getCUDAHooks().registerCUDATypes(this); + }); + return thc_state.get(); + } + + THCState* getTHCState() { + // AT_ASSERT(thc_state); + return thc_state.get(); + } + + CUDAStream createCUDAStream() const { + return detail::CUDAStream_createAndRetainWithOptions( + CUDAStream::DEFAULT_FLAGS + , CUDAStream::DEFAULT_PRIORITY + ); + } + + CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) const { + return detail::CUDAStream_createAndRetainWithOptions(flags, priority); + } + + CUDAStream getDefaultCUDAStream() const { + return detail::CUDAStream_getDefaultStream(); + } + + CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) const { + return detail::CUDAStream_getDefaultStreamOnDevice(device); + } + + CUDAStream getCurrentCUDAStream() const { + return detail::CUDAStream_getAndRetainCurrentStream(); + } + + CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) const { + return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); + } + + void setCurrentCUDAStream(CUDAStream stream) const { + return detail::CUDAStream_setStream(stream.internals()); + } + + void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) const { + return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); + } + +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle() const { + return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); + } +#endif + cudaDeviceProp* getCurrentDeviceProperties() const { + return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); + } + cudaDeviceProp* getDeviceProperties(int device) const { + return detail::getCUDAHooks().getDeviceProperties(thc_state.get(), device); + } + int getNumGPUs() const { + return detail::getCUDAHooks().getNumGPUs(); + } + size_t freshTypeID() { + return next_id++; + } + bool setFlushDenormal(bool on); + + // NB: This method is *purely* whether or not a user requested + // that CuDNN was enabled, it doesn't actually say anything about + // whether or not CuDNN is actually usable. 
Use cudnn_is_acceptable + // to test this instead + bool userEnabledCuDNN() const; + void setUserEnabledCuDNN(bool e); + bool benchmarkCuDNN() const; + void setBenchmarkCuDNN(bool); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + std::unique_ptr + generator_registry[static_cast(Backend::NumOptions)]; +private: + // NB: type_registry has nullptr for all CUDA backends until + // CUDA initialization has occurred + std::unique_ptr type_registry + [static_cast(Backend::NumOptions)] + [static_cast(ScalarType::NumOptions)]; + void initCUDAIfNeeded(Backend p) { + if(p == Backend::CUDA) + lazyInitCUDA(); + } + std::once_flag thc_init; + bool enabled_cudnn = true; + bool deterministic_cudnn = false; + bool benchmark_cudnn = false; + std::atomic next_id; + std::unique_ptr thc_state; + friend struct Type; + friend void register_cuda_types(Context * context); +}; + +AT_API Context & globalContext(); + +static inline void init() { + globalContext(); + if (const char *env_p = std::getenv("OMP_NUM_THREADS")) { + at::set_num_threads(std::stoi(env_p)); + } + if (const char *env_p = std::getenv("MKL_NUM_THREADS")) { + at::set_num_threads(std::stoi(env_p)); + } +} + +static inline Type& getType(Backend p, ScalarType s) { + return globalContext().getType(p, s); +} + +static inline Type& CPU(ScalarType s) { + return getType(Backend::CPU, s); +} + +static inline Type& CUDA(ScalarType s) { + return getType(Backend::CUDA, s); +} + +static inline bool hasCUDA() { + return globalContext().hasCUDA(); +} + +static inline bool hasCuDNN() { + return globalContext().hasCuDNN(); +} + +static inline bool hasMKL() { + return globalContext().hasMKL(); +} + +static inline int64_t current_device() { + return globalContext().current_device(); +} + +} // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp new file mode 100644 index 0000000..963a835 --- /dev/null +++ b/aten/src/ATen/DLConvertor.cpp @@ -0,0 +1,172 @@ +#include "ATen/DLConvertor.h" + +#include +#include + + +using namespace std; +namespace at { + +static DLDataType getDLDataType(const Type& type) { + DLDataType dtype; + dtype.lanes = 1; + dtype.bits = type.elementSizeInBytes() * 8; + switch (type.scalarType()) { + case ScalarType::Byte: + dtype.code = DLDataTypeCode::kDLUInt; + break; + case ScalarType::Char: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Double: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Float: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Int: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Long: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Short: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Half: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Undefined: + throw std::logic_error("Undefined is not a valid ScalarType"); + case ScalarType::NumOptions: + throw std::logic_error("NumOptions is not a valid ScalarType"); + } + return dtype; +} + + +static DLContext getDLContext(const Type& type, const int64_t& device_id) { + DLContext ctx; + ctx.device_id = device_id; + if (type.is_cuda()) { + ctx.device_type = DLDeviceType::kDLGPU; + } else { + ctx.device_type = DLDeviceType::kDLCPU; + } + return ctx; +} + + +static Backend getATenBackend(const DLContext& ctx) { + Backend backend; + switch (ctx.device_type) { + case DLDeviceType::kDLCPU: + backend = Backend::CPU; + break; + case DLDeviceType::kDLGPU: + backend = Backend::CUDA; + break; + 
default: + throw std::logic_error("Unsupported device_type: " + std::to_string(ctx.device_type)); + } + return backend; +} + + +ScalarType toScalarType(const DLDataType& dtype) { + ScalarType stype; + if (dtype.lanes != 1) throw std::logic_error("ATen does not support lanes != 1"); + switch (dtype.code) { + case DLDataTypeCode::kDLUInt: + switch (dtype.bits) { + case 8: + stype = ScalarType::Byte; + break; + default: + throw std::logic_error("Unsupported kUInt bits " + std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLInt: + switch (dtype.bits) { + case 8: + stype = ScalarType::Char; + break; + case 16: + stype = ScalarType::Short; + break; + case 32: + stype = ScalarType::Int; + break; + case 64: + stype = ScalarType::Long; + break; + default: + throw std::logic_error("Unsupported kInt bits " + std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat: + switch (dtype.bits) { + case 16: + stype = ScalarType::Half; + break; + case 32: + stype = ScalarType::Float; + break; + case 64: + stype = ScalarType::Double; + break; + default: + throw std::logic_error("Unsupported kFloat bits " + std::to_string(dtype.bits)); + } + break; + default: + throw std::logic_error("Unsupported code " + std::to_string(dtype.code)); + } + return stype; +} + +struct ATenDLMTensor { + Tensor handle; + DLManagedTensor tensor; +}; + +void deleter(DLManagedTensor * arg) { + delete static_cast(arg->manager_ctx); +} + + +// This function returns a shared_ptr to memory managed DLpack tensor constructed +// out of ATen tensor +DLManagedTensor* toDLPack(const Tensor& src) { + ATenDLMTensor * atDLMTensor(new ATenDLMTensor); + atDLMTensor->handle = src; + atDLMTensor->tensor.manager_ctx = atDLMTensor; + atDLMTensor->tensor.deleter = &deleter; + atDLMTensor->tensor.dl_tensor.data = src.data_ptr(); + int64_t device_id = 0; + if (src.type().is_cuda()) { + device_id = src.get_device(); + } + atDLMTensor->tensor.dl_tensor.ctx = getDLContext(src.type(), device_id); + atDLMTensor->tensor.dl_tensor.ndim = src.dim(); + atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src.type()); + atDLMTensor->tensor.dl_tensor.shape = const_cast(src.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(src.strides().data()); + atDLMTensor->tensor.dl_tensor.byte_offset = 0; + return &(atDLMTensor->tensor); +} + + +Tensor fromDLPack(const DLManagedTensor* src) { + Backend backend = getATenBackend(src->dl_tensor.ctx); + ScalarType stype = toScalarType(src->dl_tensor.dtype); + auto deleter = [src](void * self) { + src->deleter(const_cast(src)); + }; + return getType(backend, stype).tensorFromBlob( + src->dl_tensor.data, + IntList(src->dl_tensor.shape, src->dl_tensor.ndim), + IntList(src->dl_tensor.strides, src->dl_tensor.ndim), + deleter); +} +} //namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h new file mode 100644 index 0000000..5ed9899 --- /dev/null +++ b/aten/src/ATen/DLConvertor.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/ATen.h" +#include "ATen/dlpack.h" + +// this convertor will: +// 1) take a Tensor object and wrap it in the DLPack tensor +// 2) take a dlpack tensor and convert it to the ATen Tensor + +namespace at { + +AT_API ScalarType toScalarType(const DLDataType& dtype); +AT_API DLManagedTensor * toDLPack(const Tensor& src); +AT_API Tensor fromDLPack(const DLManagedTensor* src); + +} //namespace at diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap new file mode 100644 index 
0000000..760c4a2 --- /dev/null +++ b/aten/src/ATen/Declarations.cwrap @@ -0,0 +1,3737 @@ +[[ + name: storageOffset + python_name: storage_offset + cpu_half: True + device_guard: False + return: long + arguments: + - THTensor* self +]] +[[ + name: nDimension + python_name: ndimension + cpu_half: True + device_guard: False + return: long + arguments: + - THTensor* self +]] +[[ + name: resize_ + return: self + cname: resize + cpu_half: True + arguments: + - THTensor* self + - arg: THSize* size + long_args: True + - CONSTANT NULL +]] +[[ + name: set_ + cname: set + cpu_half: True + device_guard: False + return: argument 0 + options: + - cname: set + scalar_check: source_->isScalar() + arguments: + - THTensor* self + - THTensor* source + - cname: setStorage + scalar_check: False + arguments: + - THTensor* self + - CONSTANT NULL, 0, THLongStorageView({0}, THLongStorageViewKind::SIZE), NULL + - cname: setStorage + scalar_check: False + arguments: + - THTensor* self + - THStorage* source + - CONSTANT 0 + - CONSTANT __storage_size.get() + - CONSTANT NULL + - cname: setStorage + arguments: + - THTensor* self + - THStorage* source + - long storage_offset + - THSize* size + - arg: THStride* stride + default: NULL +]] +[[ + name: _fill_ + return: self + cname: fill + options: + - arguments: + - THTensor* self + - real value + - zero_dim_tensor_only: True + arguments: + - THTensor* self + - THTensor* value +]] +[[ + name: isContiguous + python_name: is_contiguous + cpu_half: True + device_guard: False + return: bool + arguments: + - THTensor* self +]] +[[ + name: isSetTo + python_name: is_set_to + cpu_half: True + device_guard: False + return: bool + arguments: + - THTensor* self + - THTensor* tensor +]] +[[ + name: maskedFill_ + cname: maskedFill + python_name: masked_fill_ + return: self + options: + - arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - real value + - zero_dim_tensor_only: True + arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - THTensor* value +]] +[[ + name: maskedCopy_ + cname: maskedCopy + python_name: masked_scatter_ + return: self + arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - THTensor* source +]] +[[ + name: maskedSelect + python_name: masked_select + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: mask fallback types:Byte + - THBoolTensor* mask +]] +[[ + name: nonzero + variants: + - method + - function + return: argument 0 + arguments: + - arg: THIndexTensor* result + output: True + - THTensor* self +]] +[[ + name: contiguous + cname: newContiguous + return: THTensor* + arguments: + - THTensor* self +]] +[[ + name: th_clone + cname: newClone + return: THTensor* + variants: + - function + cpu_half: True + arguments: + - THTensor* self +]] +[[ + name: view + cname: newView + device_guard: False + return: THTensor* + arguments: + - THTensor* self + - arg: THSize* size + long_args: True +]] +[[ + name: resizeAs_ + python_name: th_resize_as_ + cname: resizeAs + variants: + - function + return: self + scalar_check: the_template_->isScalar() + arguments: + - THTensor* self + - THTensor* the_template +]] +[[ + name: indexSelect + python_name: index_select + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: 
self + - THIndexTensor* index +]] +[[ + name: _indexCopy_ + cname: indexCopy + return: argument 0 + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* source +]] +[[ + name: take + cname: take + variants: + - method + - function + return: argument 0 + scalar_check: index_->isScalar() + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THIndexTensor* index +]] +[[ + name: put_ + cname: put + backends: + - CPU + - CUDA + return: argument 0 + arguments: + - THTensor* self + - THIndexTensor* index + - THTensor* source + - arg: bool accumulate + default: "false" +]] +[[ + name: indexAdd_ + python_name: index_add_ + cname: indexAdd + return: argument 0 + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* source +]] +[[ + name: indexFill_ + python_name: index_fill_ + cname: indexFill + return: argument 0 + options: + - arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - real value + - zero_dim_tensor_only: True + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* value +]] +[[ + name: unfold + cpu_half: True + device_guard: False + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dimension + wrap_dim: self + - long size + - long step +]] +[[ + name: _range + cname: range + variants: + - function + backends: + - CPU + - CUDA + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - accreal start + - accreal end + - arg: accreal step + default: 1 +]] +[[ + name: _arange + variants: + - function + backends: + - CPU + - CUDA + return: argument 0 + options: + - cname: arange + arguments: + - arg: THTensor* result + output: True + - accreal start + - accreal end + - arg: accreal step + default: 1 + - cname: arange + arguments: + - arg: THTensor* result + output: True + - CONSTANT 0 + - accreal end + - CONSTANT 1 +]] +[[ + name: scatter_ + return: argument 0 + options: + - cname: scatter + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* src + - cname: scatterFill + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - real value +]] +[[ + name: scatter_add_ + return: argument 0 + cname: scatterAdd + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* src +]] +[[ + name: gather + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: index + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index +]] +[[ + name: data_ptr + with_gil: True + device_guard: False + return: void* + cpu_half: True + cname: data + arguments: + - THTensor* self +]] +[[ + name: equal + variants: + - method + - function + return: bool + arguments: + - THTensor* self + - THTensor* other +]] +[[ + name: __and__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitand + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitand + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __iand__ + variants: + - method + return: argument 0 + options: + - cname: bitand + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitand + 
arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __or__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitor + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitor + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ior__ + variants: + - method + return: argument 0 + options: + - cname: bitor + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitor + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __xor__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitxor + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitxor + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ixor__ + variants: + - method + return: argument 0 + options: + - cname: bitxor + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitxor + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __lshift__ + variants: + - method + - function + return: argument 0 + options: + - cname: lshift + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: clshift + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ilshift__ + variants: + - method + return: argument 0 + options: + - cname: lshift + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: clshift + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __rshift__ + variants: + - method + - function + return: argument 0 + options: + - cname: rshift + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: crshift + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __irshift__ + variants: + - method + return: argument 0 + options: + - cname: rshift + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: crshift + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: lt + variants: + - method + - function + return: argument 0 + options: + - cname: ltValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: ltTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: lt_ + return: self + options: + - cname: ltValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: ltTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: THTensor* other +]] +[[ + name: gt + variants: + - method + - function + return: argument 0 + options: + - cname: gtValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: gtTensor + arguments: + - arg: THBoolTensor* result + 
output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: gt_ + return: self + options: + - cname: gtValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: gtTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: le + variants: + - method + - function + return: argument 0 + options: + - cname: leValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: leTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: le_ + return: self + options: + - cname: leValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: leTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: ge + variants: + - method + - function + return: argument 0 + options: + - cname: geValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: geTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: ge_ + return: self + options: + - cname: geValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: geTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: eq + variants: + - method + - function + return: argument 0 + options: + - cname: eqValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: eqTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: eq_ + return: self + options: + - cname: eqValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: eqTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: ne + variants: + - method + - function + return: argument 0 + options: + - cname: neValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: neTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: ne_ + return: self + options: + - cname: neValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: neTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: min + variants: + - method + - function + options: + - cname: minall + return: real + arguments: + - THTensor* self + - cname: cmin + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: _th_min + variants: + - method + - function + options: + - cname: min + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* min + output: True + - arg: THIndexTensor* min_indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: max + variants: + - method + - function + options: + - 
cname: maxall + return: real + arguments: + - THTensor* self + - cname: cmax + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: _th_max + variants: + - method + - function + options: + - cname: max + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* max + output: True + - arg: THIndexTensor* max_indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_kthvalue + backends: + - CPU + variants: + - method + - function + cname: kthvalue + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - long k + - arg: long dim + wrap_dim: self + default: __last_dim + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_mode + variants: + - method + - function + cname: mode + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + default: __last_dim + - arg: bool keepdim + default: "false" +]] +[[ + name: median + variants: + - method + - function + return: argument 0,1 + options: + - cname: medianall + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_median + variants: + - method + - function + cname: median + return: argument 0,1 + options: + - cname: median + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: sort + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + default: __last_dim + wrap_dim: self + - arg: bool descending + default: "false" +]] +[[ + name: topk + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - long k + - arg: long dim + default: __last_dim + wrap_dim: self + - arg: bool largest + default: "true" + - arg: bool sorted + default: "true" +]] +[[ + name: all + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAndAll + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_all + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAnd + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: any + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAnyAll + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_any + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAny + return: 
argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: getDevice + python_name: _th_get_device + device_guard: False + variants: + - function + backends: + - CUDA + return: long + arguments: + - THTensor* self +]] +[[ + name: _abs + cname: abs + types: + - floating_point + - Long + - Int + - Short + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_sigmoid + types: + - floating_point + backends: + - CUDA + cname: sigmoid + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log + cname: log + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log10 + cname: log10 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log1p + cname: log1p + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log2 + cname: log2 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: lgamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: lgamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: lgamma + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: digamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: digamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: digamma + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: polygamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - int64_t n + - THTensor* self +]] +[[ + name: polygamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: polygamma + return: self + arguments: + - THTensor* self + - int64_t n + - THTensor* self +]] +[[ + name: _exp + cname: exp + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _expm1 + cname: expm1 + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _cos + cname: cos + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* 
self +]] +[[ + name: _acos + cname: acos + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _cosh + cname: cosh + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sin + cname: sin + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _asin + cname: asin + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sinh + cname: sinh + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _tan + cname: tan + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _atan + cname: atan + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_tanh + cname: tanh + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _erf + cname: erf + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _erfc + cname: erfc + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: erfinv_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: erfinv + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: erfinv + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sqrt + cname: sqrt + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _rsqrt + cname: rsqrt + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _ceil + cname: ceil + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _floor + cname: floor + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _round + cname: round + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + 
arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _trunc + cname: trunc + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: frac_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: frac + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: frac + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_var + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: varall + return: accreal + arguments: + - THTensor* self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - cname: var + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_std + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: stdall + return: accreal + arguments: + - THTensor* self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - cname: std + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - arg: bool keepdim + default: "false" +]] +[[ + name: th_norm + cname: norm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - function + options: + - cname: normall + return: accreal + arguments: + - THTensor* self + - arg: real p + default: AS_REAL(2) +]] +[[ + name: _th_norm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: norm + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: real p + python_default_init: AS_REAL(2) + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: renorm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real p + - arg: long dim + wrap_dim: self + - real maxnorm +]] +[[ + name: renorm_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: renorm + return: self + arguments: + - THTensor* self + - THTensor* self + - real p + - arg: long dim + wrap_dim: self + - real maxnorm +]] +[[ + name: dist + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: dist + return: accreal + arguments: + - arg: THTensor* self + broadcast: other fallback + - THTensor* other + - arg: real p + default: AS_REAL(2) +]] +[[ + name: reciprocal + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: cinv + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: reciprocal_ + 
types: + - floating_point + backends: + - CPU + - CUDA + options: + - cname: cinv + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: neg + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: neg + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: neg_ + backends: + - CPU + - CUDA + options: + - cname: neg + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: atan2 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + cname: atan2 + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: atan2_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: atan2 + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other fallback inplace + - THTensor* other +]] +[[ + name: th_pow + cname: pow + variants: + - function + return: argument 0 + options: + - cname: pow + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real exponent +]] +[[ + name: pow + variants: + - method + - function + return: argument 0 + options: + - cname: cpow + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: exponent fallback + - THTensor* exponent +]] +[[ + name: pow + variants: + - function + return: argument 0 + options: + - cname: tpow + arguments: + - arg: THTensor* result + output: True + - real base + - THTensor* self +]] +[[ + name: pow_ + return: argument 0 + cname: pow + options: + - cname: pow + arguments: + - THTensor* self + - THTensor* self + - real exponent + - cname: cpow + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: exponent inplace fallback + - THTensor* exponent +]] +[[ + name: lerp + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + cname: lerp + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: end fallback + - THTensor* end + - real weight +]] +[[ + name: lerp_ + types: + - floating_point + backends: + - CPU + - CUDA + return: self + cname: lerp + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: end fallback inplace + - THTensor* end + - real weight +]] +[[ + name: _linspace + cname: linspace + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - real start + - real end + - arg: long steps + default: 100 +]] +[[ + name: _logspace + cname: logspace + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - real start + - real end + - arg: long steps + default: 100 +]] +[[ + name: histc + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long bins + default: 100 + - arg: real min + default: 0 + - arg: real max + default: 0 +]] +[[ + name: th_zero_ + cname: zero + return: self + variants: + - function + arguments: + - THTensor* self +]] +[[ + name: _sumall + variants: + - method + - function + options: + - cname: sumall + return: accreal + arguments: + - THTensor* self +]] +[[ + name: _th_sum + variants: + - method + - 
function + options: + - cname: sum + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _prodall + variants: + - method + - function + options: + - cname: prodall + return: accreal + arguments: + - THTensor* self +]] +[[ + name: _th_prod + variants: + - method + - function + options: + - cname: prod + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _cumsum + cname: cumsum + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self +]] +[[ + name: _cumprod + cname: cumprod + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self +]] +[[ + name: sign + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: sign_ + cname: sign + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: trace + variants: + - method + - function + return: accreal + arguments: + - THTensor* self +]] +[[ + name: th_add + variants: + - function + return: argument 0 + options: + - cname: cadd + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: add + variants: + - method + - function + return: argument 0 + options: + - cname: add_scaled + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_add_ + return: argument 0 + variants: [function] + options: + - cname: cadd + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: add_ + return: argument 0 + options: + - cname: add_scaled + arguments: + - THTensor* self + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_sub + variants: + - function + return: argument 0 + options: + - cname: csub + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: sub + variants: + - method + - function + return: argument 0 + options: + - cname: sub_scaled + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_sub_ + return: argument 0 + variants: [function] + options: + - cname: csub + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: sub_ + return: argument 0 + options: + - cname: sub_scaled + arguments: + - THTensor* self + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True 
+]] +[[ + name: th_mul + variants: + - function + return: argument 0 + options: + - cname: mul + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cmul + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: THTensor* other +]] +[[ + name: th_mul_ + variants: + - function + return: argument 0 + options: + - cname: mul + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cmul + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: th_div + variants: + - function + return: argument 0 + options: + - cname: div + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other +]] +[[ + name: div + variants: + - method + - function + return: argument 0 + options: + - cname: cdiv + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: th_div_ + variants: [function] + return: argument 0 + options: + - cname: div + arguments: + - THTensor* self + - THTensor* self + - real other +]] +[[ + name: div_ + return: argument 0 + options: + - cname: cdiv + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: fmod + return: argument 0 + variants: + - method + - function + options: + - cname: fmod + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cfmod + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: fmod_ + return: argument 0 + options: + - cname: fmod + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cfmod + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: remainder + return: argument 0 + variants: + - method + - function + options: + - cname: remainder + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cremainder + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: THTensor* other +]] +[[ + name: remainder_ + return: argument 0 + options: + - cname: remainder + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cremainder + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: clamp + cname: clamp + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real min + - real max +]] +[[ + name: clamp_ + cname: clamp + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real min + - real max +]] +[[ + name: clamp_min + cname: cmaxValue + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real min +]] +[[ + name: clamp_min_ + cname: cmaxValue + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real min +]] +[[ + name: clamp_max + cname: cminValue + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real max +]] +[[ + name: clamp_max_ + cname: 
cminValue + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real max +]] +[[ + name: _dot + backend_type_pairs: [[CUDA,floating_point], [CPU,all]] + cname: dot + variants: + - method + - function + return: accreal + arguments: + - arg: THTensor* self + - arg: THTensor* tensor +]] +[[ + name: tril + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: tril_ + cname: tril + return: self + arguments: + - THTensor* self + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: triu + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: triu_ + cname: triu + return: self + arguments: + - THTensor* self + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: cross + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* other + - arg: long dim + default: -1 +]] +[[ + name: diag + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 + aten_custom_call: | + if (self_->isScalar()) { + throw std::runtime_error("Input must be 1-d or 2-d"); + } + ${THTensor}_diag(${state,}result_->tensor, self_->tensor, diagonal); + result_->maybeScalar(self_->isScalar()); +]] +[[ + name: th_addmm + cname: addmm + variants: + - function + return: argument 0 + options: + - arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: mat1,mat2 dims:mat1.dim0,mat2.dim1 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat1 + - THTensor* mat2 +]] +[[ + name: th_addmm_ + variants: [function] + return: self + options: + - cname: addmm + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat1 + - THTensor* mat2 +]] +[[ + name: _addmv + cname: addmv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: mat,vec dims:mat.dim0 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat + - THTensor* vec +]] +[[ + name: _addmv_ + cname: addmv + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat + - THTensor* vec +]] +[[ + name: _addr + cname: addr + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: vec1,vec2 dims:vec1.dim0,vec2.dim0 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* vec1 + - THTensor* vec2 +]] +[[ + name: _addr_ + cname: addr + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* vec1 + - THTensor* 
vec2 +]] +[[ + name: _ger + cname: addr + variants: + - method + - function + return: argument 0 + scalar_check: False + arguments: + - arg: THTensor* result + output: True + resize: [ [self,0], [vec2,0] ] + resize_scalar: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* vec2 +]] +[[ + name: _mv + cname: addmv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: [ [self, 0] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* vec +]] +[[ + name: _mm + variants: + - method + - function + return: argument 0 + options: + - cname: addmm + arguments: + - arg: THTensor* result + output: True + resize: [ [self, 0], [mat2,1] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* mat2 +]] +[[ + name: bmm + cname: baddbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: [ [self,0], [self,1], [mat2,2] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* mat2 +]] +[[ + name: addbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: batch1,batch2 dims:batch1.dim1,batch2.dim2 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: addbmm_ + cname: addbmm + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: baddbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: batch1,batch2 dims:batch1.dim0,batch1.dim1,batch2.dim2 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: baddbmm_ + cname: baddbmm + return: argument 0 + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: addcmul + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: tensor1,tensor2 fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: addcmul_ + options: + - cname: addcmul + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: tensor1,tensor2 inplace fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 + - cname: spaddcmul + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THSTensor* tensor1 + - THSTensor* tensor2 +]] +[[ + name: addcdiv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: tensor1,tensor2 fallback + - arg: real value + 
default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: addcdiv_ + cname: addcdiv + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: tensor1,tensor2 inplace fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: _gesv_single + cname: gesv + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* solution + output: True + - arg: THTensor* lu + output: True + - THTensor* self + - THTensor* A +]] +[[ + name: gels + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - THTensor* A +]] +[[ + name: trtrs + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - THTensor* A + - arg: bool upper + if_true: U + if_false: L + default: U + - arg: bool transpose + if_true: T + if_false: N + default: N + - arg: bool unitriangular + if_true: U + if_false: N + default: N +]] +[[ + name: symeig + cname: syev + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - arg: bool eigenvectors + if_true: V + if_false: N + default: N + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: eig + cname: geev + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - arg: bool eigenvectors + if_true: V + if_false: N + default: N +]] +[[ + name: svd + cname: gesvd + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1,2 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - arg: THTensor* res3 + output: True + - THTensor* self + - arg: bool some + if_true: S + if_false: A + default: S +]] +[[ + name: _getri + cname: getri + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self +]] +[[ + name: potrf + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: potrs + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: potri + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: pstrf + types: + - Float + - 
Double + backends: + - CPU + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THIntegerTensor* res2 + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U + - arg: real tol + default: -1 + aten_custom_call: | + ${THTensor}_pstrf(res1_->tensor, res2_->tensor, self_->tensor, (upper) ? "U" : "L", tol_); + res2 -= 1; // LAPACK returns 1-indexed pivots +]] +[[ + name: qr + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self +]] +[[ + name: geqrf + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self +]] +[[ + name: orgqr + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 +]] +[[ + name: ormqr + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 + - THTensor* input3 + - arg: bool left + if_true: L + if_false: R + default: L + - arg: bool transpose + if_true: T + if_false: N + default: N +]] +[[ + name: btrifact + cname: btrifact + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* result + output: True + - arg: THIntegerTensor* pivots + output: True + - CONSTANT NULL + - arg: bool pivot + kwarg_only: True + default: "true" + - THTensor* self +]] +[[ + name: btrifact_with_info + cname: btrifact + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1,2 + arguments: + - arg: THTensor* result + output: True + - arg: THIntegerTensor* pivots + output: True + - arg: THIntegerTensor* info + output: True + - arg: bool pivot + kwarg_only: True + default: "true" + - THTensor* self +]] +[[ + name: btrisolve + cname: btrisolve + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* LU_data + - THIntegerTensor* LU_pivots +]] +[[ + name: random_ + backends: + - CPU + - CUDA + return: self + options: + - cname: random + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - cname: cappedRandom + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - int64_t to + - cname: clampedRandom + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - int64_t from + - int64_t to +]] +[[ + name: multinomial + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THIndexTensor* result + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* self + - long num_samples + - arg: bool replacement + default: "false" +]] +[[ + name: uniform_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: uniform + return: self + 
arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double from + default: 0 + - arg: double to + default: 1 +]] +[[ + name: normal + types: + - floating_point + backends: + - CPU + - CUDA + return: argument 0 + variants: + - function + options: + - cname: normal_means + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* mean + - arg: double std + default: 1 + - cname: normal_stddevs + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + - THTensor* std + - cname: normal_means_stddevs + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* mean + - THTensor* std +]] +[[ + name: normal_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: normal + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + default: 0 + - arg: double std + default: 1 +]] +[[ + name: cauchy_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: cauchy + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double median + default: 0 + - arg: double sigma + default: 1 +]] +[[ + name: logNormal_ + cname: logNormal + python_name: log_normal_ + types: + - floating_point + backends: + - CPU + - CUDA + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + default: 1 + - arg: double std + default: 2 +]] +[[ + name: exponential_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: exponential + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double lambd + default: 1 +]] +[[ + name: geometric_ + backends: + - CPU + - CUDA + cname: geometric + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - double p +]] +[[ + name: _bernoulli_ + backends: + - CPU + - CUDA + cname: bernoulli + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - double p +]] +[[ + name: _th_bernoulli + types: + - Float + - Double + return: argument 0 + variants: + - method + - function + cname: bernoulli_Tensor + arguments: + - arg: THTensor* output + output: True + resize: self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* self +]] +[[ + name: _dirichlet_grad + types: + - floating_point + backends: + - CPU + return: argument 0 + variants: + - function + options: + - cname: dirichlet_grad + arguments: + - arg: THTensor* output + output: True + - THTensor* x + - THTensor* alpha + - THTensor* total +]] +[[ + name: th_tensor + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: new + arguments: [] + - cname: newWithSize + arguments: + - THSize* size + - CONSTANT NULL +]] +[[ + name: tensor + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: newWithSize + arguments: + - THSize* size + - arg: THStride* stride + - cname: newWithStorage + arguments: + - THStorage* storage + - int64_t storageOffset + - THSize* size + - arg: THStride* stride + default: NULL +]] + +# In theory, this 
could be a part of the above declaration. But in +# practice this leads to all sorts of problems with ambiguous overloads. +# So we add it here with a separate name. +[[ + name: alias + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: newWithTensor + arguments: + - THTensor* self +]] +[[ + name: _copy_ignoring_overlaps_ + cname: copyIgnoringOverlaps + return: self + backends: + - CUDA + arguments: + - THTensor* self + - THTensor* src +]] + +[[ + name: as_strided + variants: [method,function] + cpu_half: True + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THSize* size + - THStride* stride + - arg: int64_t storage_offset + aten_custom_call: | + ${THTensor}_setStorage(${state,}result_->tensor, self_->tensor->storage, storage_offset, size_, stride_); + result_->maybeScalar(size.size() == 0); +]] + +[[ + name: as_strided_ + variants: [method] + cpu_half: True + return: argument 0 + arguments: + - THTensor* self + - THSize* size + - THStride* stride + - arg: int64_t storage_offset + aten_custom_call: | + ${THTensor}_setStorage(${state,}self_->tensor, self_->tensor->storage, storage_offset, size_, stride_); + self_->maybeScalar(size.size() == 0); +]] + +[[ + name: _cat + cname: catArray + variants: [function] + return: self + arguments: + - arg: THTensor* self + output: True + - TensorList tensors + - arg: int64_t dim + default: 0 +]] diff --git a/aten/src/ATen/Deprecated.h b/aten/src/ATen/Deprecated.h new file mode 100644 index 0000000..6e136ed --- /dev/null +++ b/aten/src/ATen/Deprecated.h @@ -0,0 +1,16 @@ +#pragma once + +// Largely from https://stackoverflow.com/questions/295120/c-mark-as-deprecated + +#if defined(__cplusplus) && __cplusplus > 201402L +#define AT_DEPRECATED(function) [[deprecated]] function +#else +#if defined(__GNUC__) +#define AT_DEPRECATED(function) __attribute__((deprecated)) function +#elif defined(_MSC_VER) +#define AT_DEPRECATED(function) __declspec(deprecated) function +#else +#warning "You need to implement AT_DEPRECATED for this compiler" +#define AT_DEPRECATED(function) function +#endif // defined(__GNUC__) +#endif // defined(__cplusplus) && __cplusplus > 201402L diff --git a/aten/src/ATen/Device.cpp b/aten/src/ATen/Device.cpp new file mode 100644 index 0000000..14ad860 --- /dev/null +++ b/aten/src/ATen/Device.cpp @@ -0,0 +1,100 @@ +#include + +#include + +#include +#include +#include +#include + +namespace at { +namespace { +std::pair parse_type(const std::string& device_string) { + auto position = device_string.find("cpu"); + if (position != std::string::npos) { + return {Device::Type::CPU, 3}; + } + position = device_string.find("cuda"); + if (position != std::string::npos) { + return {Device::Type::CUDA, 4}; + } + AT_ERROR("Expected 'cpu' or 'cuda' device type at start of device string"); +} +} // namespace + +// `std::regex` is still in a very incomplete state in GCC 4.8.x, +// so we have to do our own parsing, like peasants. 
+// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions +// +// Replace with the following code once we shed our GCC skin: +// +// static const std::regex regex( +// "(cuda|cpu)|(cuda|cpu):([0-9]+)|([0-9]+)", +// std::regex_constants::basic); +// std::smatch match; +// const bool ok = std::regex_match(device_string, match, regex); +// AT_CHECK(ok, "Invalid device string: '", device_string, "'"); +// if (match[1].matched) { +// type_ = parse_type_from_string(match[1].str()); +// } else { +// if (match[2].matched) { +// type_ = parse_type_from_string(match[1].str()); +// } else { +// type_ = Type::CUDA; +// } +// AT_ASSERT(match[3].matched); +// index_ = std::stoi(match[3].str()); +// } +Device::Device(const std::string& device_string) : Device(Type::CPU) { + AT_CHECK(!device_string.empty(), "Device string must not be empty"); + + size_t position; + std::tie(type_, position) = parse_type(device_string); + + // e.g. 'cuda', 'cpu'. + if (position == device_string.size()) { + return; + } + + AT_CHECK( + device_string[position] == ':', + "Expected ':' to separate device type from index in device string"); + // Skip the colon. + position += 1; + + const auto index_string = device_string.substr(position); + try { + index_ = std::stoi(index_string); + } catch (const std::exception&) { + AT_ERROR( + "Could not parse device index '", + index_string, + "' in device string '", + device_string, + "'"); + } +} + +} // namespace at + +std::ostream& operator<<(std::ostream& stream, at::Device::Type type) { + switch (type) { + case at::Device::Type::CPU: { + stream << "cpu"; + break; + } + case at::Device::Type::CUDA: { + stream << "cuda"; + break; + } + } + return stream; +} + +std::ostream& operator<<(std::ostream& stream, const at::Device& device) { + stream << device.type(); + if (device.has_index()) { + stream << ":" << device.index(); + } + return stream; +} diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h new file mode 100644 index 0000000..4795b77 --- /dev/null +++ b/aten/src/ATen/Device.h @@ -0,0 +1,128 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace at { +/// Represents a a compute device on which a tensor is located. A device is +/// uniquely identified by a type, which specifies the type of machine it is +/// (e.g. CPU or CUDA GPU), and a device index or ordinal, which identifies the +/// specific compute device when there is more than one of a certain type. The +/// device index is optional, and in its defaulted state represents (abstractly) +/// "the current device". Further, there are two constraints on the value of the +/// device index, if one is explicitly stored: +/// 1. A negative index represents the current device, a non-negative index +/// represents a specific, concrete device, +/// 2. When the device type is CPU, the device index must be zero. +struct Device { + /// The possible values of the device *type*. + enum class Type { CPU, CUDA }; + + /// Converts a `Backend` to a `Device::Type` if possible. + static Type backend_to_type(Backend backend) { + switch (backend) { + case kCPU: + case kSparseCPU: + return Type::CPU; + case kCUDA: + case kSparseCUDA: + return Type::CUDA; + default: + AT_ERROR( + "Invalid backend ", toString(backend), " for Device construction"); + } + } + + /// Constructs a new `Device` from a `Type` and an optional device index. 
+ /* implicit */ Device(Type type, int32_t index = -1) + : type_(type), index_(index) { + AT_CHECK( + index == -1 || index >= 0, + "Device index must be -1 or non-negative, got ", + index); + AT_CHECK( + !is_cpu() || index <= 0, + "CPU device index must be -1 or zero, got ", + index); + } + + /// Constructs a `Device` from a string description, for convenience. + /// The string supplied must follow the following schema: + /// `(cpu|cuda):[]` + /// where `cpu:` or `cuda:` specifies the device type, and + /// `` optionally specifies a device index. + /* implicit */ Device(const std::string& device_string); + + /// Constructs a new `Device` from a `Backend` (which is converted to a + /// `Type`, if possible) and an optional device index. + /* implicit */ Device(Backend backend, int32_t index = -1) + : Device(backend_to_type(backend), index) {} + + /// Returns true if the type and index of this `Device` matches that of + /// `other`. + bool operator==(const Device& other) const noexcept { + return this->type_ == other.type_ && this->index_ == other.index_; + } + + /// Returns true if the type or index of this `Device` differs from that of + /// `other`. + bool operator!=(const Device& other) const noexcept { + return !(*this == other); + } + + /// Sets the device index. + void set_index(int32_t index) { + index_ = index; + } + + /// Returns the type of device this is. + Type type() const noexcept { + return type_; + } + + /// Returns the optional index. + const int32_t& index() const noexcept { + return index_; + } + + /// Returns true if the device has a non-default index. + bool has_index() const noexcept { + return index_ != -1; + } + + /// Return true if the device is of CUDA type. + bool is_cuda() const noexcept { + return type_ == Type::CUDA; + } + + /// Return true if the device is of CPU type. + bool is_cpu() const noexcept { + return type_ == Type::CPU; + } + + private: + Type type_; + int32_t index_ = -1; +}; +} // namespace at + +AT_API std::ostream& operator<<(std::ostream& stream, at::Device::Type type); +AT_API std::ostream& operator<<(std::ostream& stream, const at::Device& device); + +namespace std { + template<> struct hash + { + size_t operator()(const at::Device& device) const noexcept { + size_t hash_val = static_cast(device.index() + 1); + if (device.is_cuda()) { + hash_val += 2; + } + return hash_val; + } + }; +} // namespace std diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h new file mode 100644 index 0000000..6a3b84d --- /dev/null +++ b/aten/src/ATen/DeviceGuard.h @@ -0,0 +1,102 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at { +/// RAII guard that sets a certain default GPU index in its constructor, and +/// changes it back to the device that was originally active upon destruction. +/// +/// The index is always reset to the one that was active at the time of +/// construction of the guard. Even if you `set_index` after construction, the +/// destructor will still reset the index to the one that was active at +/// construction time. +struct DeviceGuard { + /// Default constructor, does nothing. + DeviceGuard() = default; + + /// Uses the given device's `index()` if it is a CUDA device, else does + /// nothing. + explicit DeviceGuard(Device device) { + if (device.is_cuda()) { + set_index(device.index()); + } + } + + /// Calls `set_device` with the given index. + explicit DeviceGuard(int32_t index) { + set_index(index); + } + + /// Sets the device to the index on which the given tensor is located. 
+ explicit DeviceGuard(const Tensor& tensor) { + set_index_from(tensor); + } + + /// Sets the device to the index on which the first tensor in the list is + /// located. If the list is empty, does nothing. + explicit DeviceGuard(const TensorList& tensors) { + if (!tensors.empty()) { + set_index_from(tensors.front()); + } + } + + /// Resets the device to the index that was active at construction of the + /// guard. + ~DeviceGuard() { + // It should only not have a value if an index was never actually set. + if (original_index_ != -1) { + // Unchecked because we don't want to throw in the destructor. + detail::DynamicCUDAInterface::unchecked_set_device(original_index_); + } + } + + /// Sets the device to the given one. + void set_index(int32_t index) { + if (index == -1) { + return; + } + AT_ASSERT(index >= 0); + if (original_index_ == -1) { + int32_t previous_index = -123; + detail::DynamicCUDAInterface::get_device(&previous_index); + original_index_ = previous_index; + if (index != original_index_) { + detail::DynamicCUDAInterface::set_device(index); + } + } else { + detail::DynamicCUDAInterface::set_device(index); + } + last_index_ = index; + } + + /// Calls `set_index` with the `Tensor`'s current device, if it is a CUDA + /// tensor. Does nothing if the `tensor` is not defined. + void set_index_from(const Tensor& tensor) { + if (tensor.defined() && tensor.is_cuda()) { + set_index(tensor.get_device()); + } + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_index() const noexcept { + return original_index_; + } + + // /// Returns the last device that was set via `set_device`, if any. + int32_t last_index() const noexcept { + return last_index_; + } + + private: + /// The original device that was active at construction of this object. + int32_t original_index_ = -1; + /// The last index that was set via `set_device`. + int32_t last_index_ = -1; +}; +} // namespace at diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h new file mode 100644 index 0000000..aaa4dc9 --- /dev/null +++ b/aten/src/ATen/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include "SmallVector.h" +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h new file mode 100644 index 0000000..6cd8722 --- /dev/null +++ b/aten/src/ATen/Dispatch.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include + +#define AT_PRIVATE_CASE_TYPE(enum_type, type, ...) \ + case enum_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/Error.cpp b/aten/src/ATen/Error.cpp new file mode 100644 index 0000000..1261fbe --- /dev/null +++ b/aten/src/ATen/Error.cpp @@ -0,0 +1,32 @@ +#include +#include + +#include +#include + +namespace at { +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { + out << loc.function << " at " << loc.file << ":" << loc.line; + return out; +} + +Error::Error(SourceLocation source_location, std::string err) + : what_without_backtrace_(err) + , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) + {} + +void Warning::warn(SourceLocation source_location, std::string msg) { + warning_handler_(source_location, msg.c_str()); +} + +void Warning::set_warning_handler(handler_t handler) { + warning_handler_ = handler; +} + +void Warning::print_warning(const SourceLocation& source_location, const char* msg) { + std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; +} + +Warning::handler_t Warning::warning_handler_ = &Warning::print_warning; + +} // namespace at diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h new file mode 100644 index 0000000..5a41eb7 --- /dev/null +++ b/aten/src/ATen/Error.h @@ -0,0 +1,131 @@ +#pragma once + +#include // for AT_API +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { return ss; } + +template +inline 
std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& +_str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + +public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. + const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_API Warning { + using handler_t = void(*)(const SourceLocation& source_location, const char* msg); + +public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning(const SourceLocation& source_location, const char* msg); + +private: + static handler_t warning_handler_; +}; + + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) 
\ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp new file mode 100644 index 0000000..f4a0ce4 --- /dev/null +++ b/aten/src/ATen/ExpandUtils.cpp @@ -0,0 +1,80 @@ +#include "ATen/ExpandUtils.h" + +namespace at { + +std::vector infer_size(IntList a, IntList b) { + auto dimsA = a.size(); + auto dimsB = b.size(); + ptrdiff_t ndim = dimsA > dimsB ? dimsA : dimsB; + std::vector expandedSizes(ndim); + + for (long i = ndim - 1; i >= 0; --i) { + long offset = ndim - 1 - i; + long dimA = dimsA - 1 - offset; + long dimB = dimsB - 1 - offset; + long sizeA = (dimA >= 0) ? a[dimA] : 1; + long sizeB = (dimB >= 0) ? b[dimB] : 1; + + AT_CHECK( + sizeA == sizeB || sizeA == 1 || sizeB == 1, + "The size of tensor a (", sizeA, + ") must match the size of tensor b (", sizeB, + ") at non-singleton dimension ", i); + + // 1s map to the other size (even 0). + expandedSizes[i] = sizeA == 1 ? sizeB : sizeA; + } + + return expandedSizes; +} + +std::tuple, std::vector> inferExpandGeometry( + const Tensor& tensor, + IntList sizes) { + int64_t ndim = sizes.size(); + + if (tensor.dim() == 0) { + std::vector expandedStrides(ndim, 0); + return std::tuple, std::vector>( + sizes.vec(), expandedStrides); + } + std::vector expandedSizes(ndim); + std::vector expandedStrides(ndim); + + // create a new geometry for the tensors + for (int64_t i = ndim - 1; i >= 0; --i) { + int64_t offset = ndim - 1 - i; + int64_t dim = tensor.dim() - 1 - offset; + int64_t size = (dim >= 0) ? tensor.sizes()[dim] : 1; + int64_t stride = (dim >= 0) ? tensor.strides()[dim] + : expandedSizes[i + 1] * expandedStrides[i + 1]; + int64_t targetSize = sizes[i]; + if (targetSize == -1) { + AT_CHECK( + dim >= 0, + "The expanded size of the tensor (", + targetSize, + ") isn't allowed in a leading, non-existing dimension ", + i); + targetSize = size; + } + if (size != targetSize) { + AT_CHECK( + size == 1, + "The expanded size of the tensor (", + targetSize, + ") must match the existing size (", + size, + ") at non-singleton dimension ", + i); + size = targetSize; + stride = 0; + } + expandedSizes[i] = size; + expandedStrides[i] = stride; + } + return std::tuple, std::vector>( + expandedSizes, expandedStrides); +} + +} // namespace at diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h new file mode 100644 index 0000000..2080e56 --- /dev/null +++ b/aten/src/ATen/ExpandUtils.h @@ -0,0 +1,133 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/Error.h" + +#include +#include +#include + +namespace at { + +AT_API std::vector infer_size(IntList a, IntList b); +std::tuple, std::vector > inferExpandGeometry(const Tensor &tensor, IntList sizes); + +// avoid copy-construction of Tensor by using a reference_wrapper. +inline void check_defined(std::initializer_list> tensors, const char *api_name) { + for (auto& t : tensors) { + if (!t.get().defined()) { + AT_ERROR(api_name, "(...) 
called with an undefined Tensor"); + } + } +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand) { + if (tensor.sizes().equals(to_expand.sizes())) { + return std::make_tuple(to_expand); + } + + return std::make_tuple(to_expand.expand(tensor.sizes(), /*implicit=*/true)); // see [expand implicit] +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand, const char *api_name) { + check_defined({tensor, to_expand}, api_name); + return expand_inplace(tensor, to_expand); +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2) { + if (tensor.sizes().equals(to_expand1.sizes()) && tensor.sizes().equals((to_expand2.sizes()))) { + return std::make_tuple(to_expand1, to_expand2); + } + + return std::make_tuple( + to_expand1.expand(tensor.sizes(), /*implicit=*/true), // see [expand implicit] + to_expand2.expand(tensor.sizes(), /*implicit=*/true)); +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2, + const char *api_name) { + check_defined({tensor, to_expand1, to_expand2}, api_name); + return expand_inplace(tensor, to_expand1, to_expand2); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, const Tensor &to_expand2) { + if (to_expand1.sizes().equals(to_expand2.sizes())) { + return std::make_tuple(to_expand1, to_expand2); + } + + auto expanded_size = infer_size(to_expand1.sizes(), to_expand2.sizes()); + return std::make_tuple( + to_expand1.expand(expanded_size, /*implicit=*/true), // see [expand implicit] + to_expand2.expand(expanded_size, /*implicit=*/true)); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) { + check_defined({to_expand1, to_expand2}, api_name); + return expand_outplace(to_expand1, to_expand2); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, + const Tensor &to_expand2, + const Tensor &to_expand3) { + if (to_expand1.sizes().equals(to_expand2.sizes()) && to_expand1.sizes().equals(to_expand3.sizes())) { + return std::make_tuple(to_expand1, to_expand2, to_expand3); + } + + auto expanded_size12 = infer_size(to_expand1.sizes(), to_expand2.sizes()); + auto expanded_size = infer_size(expanded_size12, to_expand3.sizes()); + return std::make_tuple( + to_expand1.expand(expanded_size, /*implicit=*/true), // see [expand implicit] + to_expand2.expand(expanded_size, /*implicit=*/true), + to_expand3.expand(expanded_size, /*implicit=*/true)); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, + const Tensor &to_expand2, + const Tensor &to_expand3, + const char *api_name) { + check_defined({to_expand1, to_expand2, to_expand3}, api_name); + return expand_outplace(to_expand1, to_expand2, to_expand3); +} + +inline std::tuple expand_size(const Tensor &to_expand, IntList sizes) { + if(to_expand.sizes().equals(sizes)) { + return std::make_tuple(to_expand); + } + + return std::make_tuple(to_expand.expand(sizes, /*implicit=*/true)); // see [expand implicit] +} + +inline std::tuple expand_size(const Tensor &to_expand, IntList sizes, const char *api_name) { + check_defined({to_expand}, api_name); + return expand_size(to_expand, sizes); +} + +inline std::vector expand_outplace(TensorList to_expand) { + // expands a list of Tensors; ignores undefined (null) tensors + bool first = true; + std::vector sizes; + for (size_t i = 0; i < to_expand.size(); ++i) { + if (!to_expand[i].defined()) { + continue; + } else if (first) { + sizes = 
to_expand[i].sizes(); + first = false; + } else { + sizes = infer_size(sizes, to_expand[i].sizes()); + } + } + + std::vector result(to_expand.size()); + for (size_t i = 0; i < to_expand.size(); ++i) { + if (!to_expand[i].defined()) { + continue; + } else if (to_expand[i].sizes().equals(sizes)) { + result[i] = to_expand[i]; + } else { + result[i] = to_expand[i].expand(sizes, /*implicit=*/true); // see [expand implicit] + } + } + return result; +} + +} diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp new file mode 100644 index 0000000..aab224f --- /dev/null +++ b/aten/src/ATen/Formatting.cpp @@ -0,0 +1,295 @@ +#include "ATen/Formatting.h" +#include "ATen/Tensor.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" + +#include +#include +#include + + +namespace at { + +//not all C++ compilers have default float so we define our own here +inline std::ios_base& defaultfloat(std::ios_base& __base) { + __base.unsetf(std::ios_base::floatfield); + return __base; +} +//saves/restores number formatting inside scope +struct FormatGuard { + FormatGuard(std::ostream & out) + : out(out), saved(nullptr) { + saved.copyfmt(out); + } + ~FormatGuard() { + out.copyfmt(saved); + } +private: + std::ostream & out; + std::ios saved; +}; + +std::ostream& operator<<(std::ostream & out, IntList list) { + int i = 0; + out << "["; + for(auto e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << "]"; + return out; +} + +std::ostream& operator<<(std::ostream & out, Backend b) { + return out << toString(b); +} + +std::ostream& operator<<(std::ostream & out, ScalarType t) { + return out << toString(t); +} + +std::ostream& operator<<(std::ostream & out, const Type& t) { + return out << t.toString(); +} + +static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { + auto size = self.numel(); + if(size == 0) { + return std::make_tuple(1., 0); + } + bool intMode = true; + auto self_p = self.data(); + for(int64_t i = 0; i < size; i++) { + auto z = self_p[i]; + if(std::isfinite(z)) { + if(z != ceil(z)) { + intMode = false; + break; + } + } + } + int64_t offset = 0; + while(!std::isfinite(self_p[offset])) { + offset = offset + 1; + if(offset == size) { + break; + } + } + double expMin; + double expMax; + if(offset == size) { + expMin = 1; + expMax = 1; + } else { + expMin = fabs(self_p[offset]); + expMax = fabs(self_p[offset]); + for(int64_t i = offset; i < size; i++) { + double z = fabs(self_p[i]); + if(std::isfinite(z)) { + if(z < expMin) { + expMin = z; + } + if(self_p[i] > expMax) { + expMax = z; + } + } + } + if(expMin != 0) { + expMin = floor(log10(expMin)) + 1; + } else { + expMin = 1; + } + if(expMax != 0) { + expMax = floor(log10(expMax)) + 1; + } else { + expMax = 1; + } + } + double scale = 1; + int64_t sz; + if(intMode) { + if(expMax > 9) { + sz = 11; + stream << std::scientific << std::setprecision(4); + } else { + sz = expMax + 1; + stream << defaultfloat; + } + } else { + if(expMax-expMin > 4) { + sz = 11; + if(fabs(expMax) > 99 || fabs(expMin) > 99) { + sz = sz + 1; + } + stream << std::scientific << std::setprecision(4); + } else { + if(expMax > 5 || expMax < 0) { + sz = 7; + scale = pow(10, expMax-1); + stream << std::fixed << std::setprecision(4); + } else { + if(expMax == 0) { + sz = 7; + } else { + sz = expMax+6; + } + stream << std::fixed << std::setprecision(4); + } + } + } + return std::make_tuple(scale, sz); +} + +static void __printIndent(std::ostream &stream, int64_t indent) +{ + for(int64_t i = 0; i < indent; i++) { + stream << " "; + } +} 
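// [Editor's note -- illustrative, not part of the original commit.]
// __printFormat above scans the tensor once and returns a (scale, width) pair
// that the matrix/tensor printers further down use for column alignment.
// Worked example for the values {0.001, 1000.0}: intMode is false (0.001 is
// not integral), expMin = floor(log10(0.001)) + 1 = -2 and
// expMax = floor(log10(1000)) + 1 = 4, so expMax - expMin = 6 > 4 and the
// stream is switched to scientific notation with field width 11 and
// precision 4, while scale stays 1.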
+ +static void printScale(std::ostream & stream, double scale) { + FormatGuard guard(stream); + stream << defaultfloat << scale << " *" << std::endl; +} +static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) +{ + double scale; + int64_t sz; + std::tie(scale, sz) = __printFormat(stream, self); + + __printIndent(stream, indent); + int64_t nColumnPerLine = (linesize-indent)/(sz+1); + int64_t firstColumn = 0; + int64_t lastColumn = -1; + while(firstColumn < self.size(1)) { + if(firstColumn + nColumnPerLine <= self.size(1)) { + lastColumn = firstColumn + nColumnPerLine - 1; + } else { + lastColumn = self.size(1) - 1; + } + if(nColumnPerLine < self.size(1)) { + if(firstColumn != 0) { + stream << std::endl; + } + stream << "Columns " << firstColumn+1 << " to " << lastColumn+1; + __printIndent(stream, indent); + } + if(scale != 1) { + printScale(stream,scale); + __printIndent(stream, indent); + } + for(int64_t l = 0; l < self.size(0); l++) { + Tensor row = self.select(0,l); + double *row_ptr = row.data(); + for(int64_t c = firstColumn; c < lastColumn+1; c++) { + stream << std::setw(sz) << row_ptr[c]/scale; + if(c == lastColumn) { + stream << std::endl; + if(l != self.size(0)-1) { + if(scale != 1) { + __printIndent(stream, indent); + stream << " "; + } else { + __printIndent(stream, indent); + } + } + } else { + stream << " "; + } + } + } + firstColumn = lastColumn + 1; + } +} + +void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) +{ + std::vector counter(self.ndimension()-2); + bool start = true; + bool finished = false; + counter[0] = -1; + for(size_t i = 1; i < counter.size(); i++) + counter[i] = 0; + while(true) { + for(int64_t i = 0; self.ndimension()-2; i++) { + counter[i] = counter[i] + 1; + if(counter[i] >= self.size(i)) { + if(i == self.ndimension()-3) { + finished = true; + break; + } + counter[i] = 0; + } else { + break; + } + } + if(finished) { + break; + } + if(start) { + start = false; + } else { + stream << std::endl; + } + stream << "("; + Tensor tensor = self; + for(int64_t i=0; i < self.ndimension()-2; i++) { + tensor = tensor.select(0, counter[i]); + stream << counter[i]+1 << ","; + } + stream << ".,.) 
= " << std::endl; + __printMatrix(stream, tensor, linesize, 1); + } +} + +std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize) { + FormatGuard guard(stream); + if(!tensor_.defined()) { + stream << "[ Tensor (undefined) ]"; + } else if (tensor_.is_sparse()) { + stream << "[ " << tensor_.pImpl->toString() << "{}\n"; + stream << "indices:\n" << tensor_._indices() << "\n"; + stream << "values:\n" << tensor_._values() << "\n"; + stream << "size:\n" << tensor_.sizes() << "\n"; + stream << "]"; + } else { + Type& cpudouble = tensor_.type().toBackend(kCPU).toScalarType(kDouble); + Tensor tensor = tensor_.toType(cpudouble).contiguous(); + if(tensor.ndimension() == 0) { + stream << defaultfloat << tensor.data()[0] << std::endl; + stream << "[ " << tensor_.pImpl->toString() << "{} ]"; + } else if(tensor.ndimension() == 1) { + if (tensor.numel() > 0) { + double scale; + int64_t sz; + std::tie(scale, sz) = __printFormat(stream, tensor); + if(scale != 1) { + printScale(stream, scale); + } + double* tensor_p = tensor.data(); + for(int64_t i = 0; i < tensor.size(0); i++) { + stream << std::setw(sz) << tensor_p[i]/scale << std::endl; + } + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0) << "} ]"; + } else if(tensor.ndimension() == 2) { + if (tensor.numel() > 0) { + __printMatrix(stream, tensor, linesize, 0); + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0) << "," << tensor.size(1) << "} ]"; + } else { + if (tensor.numel() > 0) { + __printTensor(stream, tensor, linesize); + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0); + for(int64_t i = 1; i < tensor.ndimension(); i++) { + stream << "," << tensor.size(i); + } + stream << "} ]"; + } + } + return stream; +} + +} diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h new file mode 100644 index 0000000..fe496a1 --- /dev/null +++ b/aten/src/ATen/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "ATen/Type.h" +#include "ATen/Scalar.h" + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, ScalarType t); +AT_API std::ostream& operator<<(std::ostream & out, const Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + s = s.local(); + return out << (s.isFloatingPoint() ? 
s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/Generator.h b/aten/src/ATen/Generator.h new file mode 100644 index 0000000..7e2b68b --- /dev/null +++ b/aten/src/ATen/Generator.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at { + +struct Generator { + Generator() {}; + Generator(const Generator& other) = delete; + Generator(Generator&& other) = delete; + virtual ~Generator() {}; + + virtual Generator& copy(const Generator& other) = 0; + virtual Generator& free() = 0; + + virtual uint64_t seed() = 0; + virtual uint64_t initialSeed() = 0; + virtual Generator& manualSeed(uint64_t seed) = 0; + virtual Generator& manualSeedAll(uint64_t seed) = 0; + virtual void * unsafeGetTH() = 0; +}; + +} // namespace at diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h new file mode 100644 index 0000000..e5563fa --- /dev/null +++ b/aten/src/ATen/Half-inl.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } +inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } +inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } +inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } + +inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } +inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } +inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } +inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= 
(float)b; } +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { return (double)a + b; } +inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } +inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } +inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } + +inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } +inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } +inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } +inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } +inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } +inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } +inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } +inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } +inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } +inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. + +} // namespace at + +namespace std { + +template<> class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = numeric_limits::tinyness_before; + static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } + static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } + static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } + static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } + static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } + static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } + static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } + static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } + static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } 
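  // [Editor's note -- descriptive comments only, not part of the original commit.]
  // The raw bit patterns above decode to the usual IEEE 754 binary16 values:
  // min() = 0x0400 = 2^-14 (smallest normal), max() = 0x7BFF = 65504,
  // lowest() = 0xFBFF = -65504, epsilon() = 0x1400 = 2^-10,
  // round_error() = 0x3800 = 0.5, infinity() = 0x7C00, and
  // denorm_min() = 0x0001 = 2^-24 (smallest subnormal).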
+}; + +} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp new file mode 100644 index 0000000..68f80a5 --- /dev/null +++ b/aten/src/ATen/Half.cpp @@ -0,0 +1,34 @@ +#include "ATen/Half.h" + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +#include +#include + +namespace at { + +static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); + +namespace detail { + +float halfbits2float(unsigned short bits) { + float value; + TH_halfbits2float(&bits, &value); + return value; +} + +unsigned short float2halfbits(float value) { + unsigned short bits; + TH_float2halfbits(&value, &bits); + return bits; +} + +} // namespace detail + +std::ostream& operator<<(std::ostream & out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h new file mode 100644 index 0000000..d740008 --- /dev/null +++ b/aten/src/ATen/Half.h @@ -0,0 +1,113 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include "ATen/ATenGeneral.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifndef AT_HOSTDEVICE + #ifdef __CUDACC__ + #define AT_HOSTDEVICE __host__ __device__ + #else + #define AT_HOSTDEVICE + #endif +#endif + +namespace at { + +namespace detail { + +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); + +} + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + return f < limit::lowest() || f > limit::max(); +} + +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::string msg = "value cannot be converted to type "; + msg += name; + msg += " without overflow: "; + msg += std::to_string(f); + throw std::domain_error(std::move(msg)); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + 
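// [Editor's note -- usage sketch, not part of the original commit.]
// checked_convert above refuses conversions that fall outside the target
// type's finite range, for example:
//
//   at::Half h = at::checked_convert<at::Half>(100000.0f, "Half");
//   // throws std::domain_error: 100000 exceeds numeric_limits<at::Half>::max() (65504)
//
//   bool b = at::overflows<at::Half>(std::numeric_limits<float>::infinity());
//   // false: infinities pass through because at::Half reports has_infinity
//
// For integral targets the isinf/isnan checks are skipped and only the
// lowest()/max() range test applies.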
+AT_API std::ostream& operator<<(std::ostream & out, const Half& value); + +} // namespace at + +#include "Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/Layout.h b/aten/src/ATen/Layout.h new file mode 100644 index 0000000..010248a --- /dev/null +++ b/aten/src/ATen/Layout.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace at { +enum class Layout { Strided, Sparse }; + +constexpr auto kStrided = Layout::Strided; +constexpr auto kSparse = Layout::Sparse; + +inline Layout layout_from_backend(Backend backend) { + switch (backend) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + return Layout::Sparse; + default: + return Layout::Strided; + } +} +} // namespace at diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h new file mode 100644 index 0000000..b3e992a --- /dev/null +++ b/aten/src/ATen/MatrixRef.h @@ -0,0 +1,100 @@ +#pragma once +#include +#include + +#include + +namespace at { + /// MatrixRef - Like an ArrayRef, but with an extra recorded strides so that + /// we can easily view it as a multidimensional array. + /// + /// Like ArrayRef, this class does not own the underlying data, it is expected + /// to be used in situations where the data resides in some other buffer. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + /// + /// For now, 2D only (so the copies are actually cheap, without having + /// to write a SmallVector class) and contiguous only (so we can + /// return non-strided ArrayRef on index). + /// + /// P.S. dimension 0 indexes rows, dimension 1 indexes columns + template + class MatrixRef { + public: + typedef size_t size_type; + + private: + /// Underlying ArrayRef + ArrayRef arr; + + /// Stride of dim 0 (outer dimension) + size_type stride0; + + // Stride of dim 1 is assumed to be 1 + + public: + /// Construct an empty Matrixref. + /*implicit*/ MatrixRef() : arr(nullptr), stride0(0) {} + + /// Construct an MatrixRef from an ArrayRef and outer stride. + /*implicit*/ MatrixRef(ArrayRef arr, size_type stride0) + : arr(arr), stride0(stride0) { + AT_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) + } + + /// @} + /// @name Simple Operations + /// @{ + + /// empty - Check if the matrix is empty. + bool empty() const { return arr.empty(); } + + const T *data() const { return arr.data(); } + + /// size - Get size a dimension + size_t size(size_t dim) const { + if (dim == 0) { + return arr.size() / stride0; + } else if (dim == 1) { + return stride0; + } else { + AT_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); + } + } + + size_t numel() const { + return arr.size(); + } + + /// equals - Check for element-wise equality. + bool equals(MatrixRef RHS) const { + return stride0 == RHS.stride0 && arr.equals(RHS.arr); + } + + /// @} + /// @name Operator Overloads + /// @{ + ArrayRef operator[](size_t Index) const { + return arr.slice(Index*stride0, stride0); + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, MatrixRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
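The deleted assignment operators continue below; as a usage sketch for the accessors above, MatrixRef is a non-owning, row-major 2-D view over a contiguous buffer, and operator[] hands back a per-row ArrayRef. This assumes an ATen build; the include path is an assumption for the sketch.

```cpp
// Hypothetical usage of at::MatrixRef as a 2-D, non-owning view.
// Assumes an ATen build; include path assumed.
#include <ATen/MatrixRef.h>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> data = {1, 2, 3,
                             4, 5, 6};                       // 2 rows x 3 columns, contiguous
  at::MatrixRef<float> m(at::ArrayRef<float>(data), /*stride0=*/3);

  std::cout << m.size(0) << "x" << m.size(1) << "\n";        // 2x3
  at::ArrayRef<float> row1 = m[1];                           // view of {4, 5, 6}, no copy
  std::cout << row1[2] << "\n";                              // 6
}
```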
+ template + typename std::enable_if::value, MatrixRef>::type & + operator=(std::initializer_list) = delete; + + }; + +} // end namespace at diff --git a/aten/src/ATen/OptionsGuard.cpp b/aten/src/ATen/OptionsGuard.cpp new file mode 100644 index 0000000..d36911b --- /dev/null +++ b/aten/src/ATen/OptionsGuard.cpp @@ -0,0 +1,16 @@ +#include +#include + +namespace at { + +thread_local at::optional DefaultTensorOptions::options_; + +TensorOptions& DefaultTensorOptions::get() { + if (!options_) { + options_.emplace( + /*use_thread_local_default_options=*/false); + } + return *options_; +} + +} // namespace at diff --git a/aten/src/ATen/OptionsGuard.h b/aten/src/ATen/OptionsGuard.h new file mode 100644 index 0000000..1aa39ac --- /dev/null +++ b/aten/src/ATen/OptionsGuard.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +/// A wrapper over a thread local TensorOptions instance. +struct DefaultTensorOptions { + /// Returns the current thread local default options. + /// Defined in OptionsGuard.cpp because we can't use optional in headers, due + /// to Windows and other compilers. + static TensorOptions& get(); + + private: + /// This is an optional because of compiler bugs that mis-initialize static + /// thread local variables. The workaround is lazy initialization, i.e. + /// `DefaultTensorOptions::get()` will initialize the `options_` to a proper + /// value upon first invocation. + /// https://gcc.gnu.org/ml/gcc-bugs/2013-12/msg00026.html + static thread_local at::optional options_; +}; + +/// RAII guard that stores the current default options upon construction, sets +/// the current default options to the ones given to its constructor, and +/// finally resets the options back to the original ones in the destructor. +struct OptionsGuard { + /// Stores the current default options and sets them to the given ones. + explicit OptionsGuard(const TensorOptions& options) + : original_(DefaultTensorOptions::get()) { + DefaultTensorOptions::get() = options; + } + + /// Restores the original default options. + ~OptionsGuard() { + DefaultTensorOptions::get() = original_; + } + + /// Returns the original options that were in place at the time of + /// construction of this object. + const TensorOptions& original() { + return original_; + } + + private: + /// The original options that were in place at the time of construction of + /// this object. + TensorOptions original_; +}; + +} // namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h new file mode 100644 index 0000000..358dde9 --- /dev/null +++ b/aten/src/ATen/Parallel.h @@ -0,0 +1,68 @@ +#pragma once +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace internal { +// This parameter is heuristically chosen to determine the minimum number of +// work that warrants paralellism. For example, when summing an array, it is +// deemed inefficient to parallelise over arrays shorter than 32768. Further, +// no parallel algorithm (such as parallel_reduce) should split work into +// smaller than GRAIN_SIZE chunks. 
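The GRAIN_SIZE constant and the parallel_for/parallel_reduce helpers it guards are defined next. As a hedged usage sketch (assumes an ATen build with OpenMP enabled; include path assumed), a call site passes a half-open index range, a grain size, and a lambda over a sub-range:

```cpp
// Usage sketch for at::parallel_for as declared below:
// parallel_for(begin, end, grain_size, f) calls f(sub_begin, sub_end) on each
// thread's chunk, or f(begin, end) once when OpenMP is unavailable or the range
// is smaller than grain_size.
#include <ATen/Parallel.h>   // include path assumed for this sketch
#include <cstdint>
#include <vector>

void scale_all(std::vector<double>& v, double factor) {
  at::parallel_for(0, static_cast<int64_t>(v.size()), at::internal::GRAIN_SIZE,
                   [&](int64_t begin, int64_t end) {
                     for (int64_t i = begin; i < end; ++i) {
                       v[i] *= factor;   // each chunk touches a disjoint index range
                     }
                   });
}
```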
+constexpr int64_t GRAIN_SIZE = 32768; +} // namespace internal + +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +template +inline void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const F f) { +#ifdef _OPENMP +#pragma omp parallel if ((end - begin) >= grain_size) + { + int64_t num_threads = omp_get_num_threads(); + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = divup((end - begin), num_threads); + int64_t begin_tid = begin + tid * chunk_size; + if (begin_tid < end) + f(begin_tid, std::min(end, chunk_size + begin_tid)); + } +#else + f(begin, end); +#endif +} + +template +inline scalar_t parallel_reduce( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const scalar_t ident, + const F f, + const SF sf) { + if (get_num_threads() == 1) { + return f(begin, end, ident); + } else { + const int64_t num_results = divup((end - begin), grain_size); + std::vector results(num_results); + scalar_t* results_data = results.data(); +#pragma omp parallel for if ((end - begin) >= grain_size) + for (int64_t id = 0; id < num_results; id++) { + int64_t i = begin + id * grain_size; + results_data[id] = f(i, i + std::min(end - i, grain_size), ident); + } + return std::accumulate( + results_data, results_data + results.size(), ident, sf); + } +} + +} // namespace at diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h new file mode 100644 index 0000000..8fe9c02 --- /dev/null +++ b/aten/src/ATen/Registry.h @@ -0,0 +1,216 @@ +#pragma once + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything that + * can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class AT_API Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. 
+ std::lock_guard lock(register_mutex_); + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register(const SrcType& key, Creator creator, const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. + */ + std::vector Keys() { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + Registry(const Registry&) = delete; + Registry& operator=(const Registry&) = delete; +}; + +template +class AT_API Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + // TODO(jiayq): old versions of NVCC does not handle make_unique well + // so we are forced to use a unique_ptr constructor here. Check if it is + // fine to use make_unique in the future. + // return make_unique(args...); + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + * Pretty much a copy from 'folly/Preprocessor.h' + */ +#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 +#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) +#else +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) +#endif + +/** + * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +#define AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + AT_API Registry, __VA_ARGS__>* RegistryName(); \ + typedef Registerer, __VA_ARGS__> \ + Registerer##RegistryName; \ + extern template class Registerer, __VA_ARGS__>; + +#define AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + Registry, __VA_ARGS__>* RegistryName() { \ + static Registry, __VA_ARGS__>* registry = \ + new Registry, __VA_ARGS__>(); \ + return registry; \ + } \ + template class Registerer, __VA_ARGS__>; + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), __VA_ARGS__); \ + } + +#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::at::demangle_type<__VA_ARGS__>()); \ + } + +// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ + AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define AT_REGISTER_CLASS(RegistryName, key, ...) \ + AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace at diff --git a/aten/src/ATen/Retainable.h b/aten/src/ATen/Retainable.h new file mode 100644 index 0000000..792a220 --- /dev/null +++ b/aten/src/ATen/Retainable.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at { + +// base class for refcounted things, allows for collects of generic +// refcounted objects that include tensors +struct Retainable { + Retainable(): refcount(1), weak_refcount(1) {} + void retain() { + ++refcount; + } + void release() { + if(--refcount == 0) { + // If we know that this is the last reference then we can skip + // all the decrements and release_resources(). + if (weak_refcount == 1) { + delete this; + } else { + release_resources(); + weak_release(); + } + } + } + void weak_retain() { + ++weak_refcount; + } + void weak_release() { + if (--weak_refcount == 0) { + delete this; + } + } + bool weak_lock() { + for (;;) { + auto current_refcount = refcount.load(); + if (current_refcount == 0) return false; + if (refcount.compare_exchange_strong(current_refcount, current_refcount + 1)) break; + } + return true; + } + uint32_t use_count() const { + return refcount.load(); + } + uint32_t weak_use_count() const { + return weak_refcount.load(); + } + + virtual void release_resources() {}; + virtual ~Retainable() {} +private: + // INVARIANT: once refcount reaches 0 it can never go up + // INVARIANT: weak_refcount = number of weak references + (refcount > 0 ? 
1 : 0) + std::atomic refcount; + std::atomic weak_refcount; +}; + +} diff --git a/aten/src/ATen/Scalar.cpp b/aten/src/ATen/Scalar.cpp new file mode 100644 index 0000000..94925db --- /dev/null +++ b/aten/src/ATen/Scalar.cpp @@ -0,0 +1,21 @@ +#include "ATen/Config.h" + +#include "ATen/Scalar.h" + +#include + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +namespace at { +Tensor Scalar::toTensor() const { + if (Tag::HAS_t == tag) { + return Tensor(t); + } else if (Tag::HAS_d == tag) { + return CPU(kDouble).scalarTensor(*this); + } else { + assert(Tag::HAS_i == tag); + return CPU(kLong).scalarTensor(*this); + } +} +} diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h new file mode 100644 index 0000000..806b05b --- /dev/null +++ b/aten/src/ATen/Scalar.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "ATen/ATenGeneral.h" +#include "ATen/Half.h" +#include "ATen/ScalarType.h" +#include "ATen/TensorBase.h" +#include "ATen/Utils.h" + + +namespace at { + +struct Tensor; + +class AT_API Scalar { +public: + Scalar() : Scalar(int64_t(0)) {} + + explicit Scalar(const detail::TensorBase & t) + : tag(Tag::HAS_t), t(t) { + AT_CHECK(t.defined(), "Attempting to create a Scalar from an undefined tensor"); + AT_CHECK(t.dim() == 0, "Attempting to create a Scalar from a ", t.dim(), " dim tensor"); + } + +#define DEFINE_IMPLICIT_CTOR(type,name,member) \ + Scalar(type vv) \ + : tag(Tag::HAS_##member) { \ + v . member = convert(vv); \ + } + + AT_FORALL_SCALAR_TYPES(DEFINE_IMPLICIT_CTOR) + +#undef DEFINE_IMPLICIT_CTOR + + // return a new scalar that is guarenteed to be not backed by a tensor. + Scalar local() const { + if (Tag::HAS_t != tag) { + return *this; + } + return t.pImpl->localScalar(); + } + +#define DEFINE_ACCESSOR(type,name,member) \ + type to##name () const { \ + if (Tag::HAS_t == tag) { \ + return local().to##name(); \ + } else if (Tag::HAS_d == tag) { \ + return checked_convert(v.d, #type); \ + } else { \ + return checked_convert(v.i, #type); \ + } \ + } + + Tensor toTensor() const; + + AT_FORALL_SCALAR_TYPES(DEFINE_ACCESSOR) + + //also support scalar.to(); + template + T to(); + +#undef DEFINE_ACCESSOR + bool isFloatingPoint() const { + return Tag::HAS_d == tag; + } + bool isIntegral() const { + return Tag::HAS_i == tag; + } + bool isBackedByTensor() const { + return Tag::HAS_t == tag; + } + +private: + enum class Tag { HAS_d, HAS_i, HAS_t }; + Tag tag; + union { + double d; + int64_t i; + } v; + detail::TensorBase t; + friend struct Type; +}; + +// define the scalar.to() specializations +template +inline T Scalar::to() { + throw std::runtime_error("to() cast to unexpected type."); +} + +#define DEFINE_TO(T,name,_) \ +template<> \ +inline T Scalar::to() { \ + return to##name(); \ +} +AT_FORALL_SCALAR_TYPES(DEFINE_TO) +#undef DEFINE_TO + +} diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h new file mode 100644 index 0000000..4cb68a6 --- /dev/null +++ b/aten/src/ATen/ScalarType.h @@ -0,0 +1,170 @@ +#pragma once + +#include + +#include "ATen/ArrayRef.h" +#include "ATen/ATenGeneral.h" +#include "ATen/Half.h" + +namespace at { + +// NB: Order matters for this macro; it is relied upon in +// _promoteTypesLookup and probably other places. 
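Before the scalar-type macros that follow, a hedged sketch of how the Scalar wrapper above behaves: the payload is tagged as a double, an int64_t, or a 0-dim tensor, and the to*() accessors route through checked_convert, so narrowing conversions that overflow throw std::domain_error. Assumes an ATen build; include path assumed.

```cpp
// Hedged sketch of at::Scalar usage (assumes an ATen build).
#include <ATen/Scalar.h>
#include <iostream>

int main() {
  at::Scalar a = 3;        // tagged HAS_i, stored as int64_t
  at::Scalar b = 2.5;      // tagged HAS_d, stored as double

  std::cout << a.isIntegral() << " " << b.isFloatingPoint() << "\n";  // 1 1
  std::cout << a.toDouble() + b.toDouble() << "\n";                   // 5.5

  at::Scalar big = 1e12;
  // big.toInt();  // would throw std::domain_error: 1e12 overflows int
}
```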
+#define AT_FORALL_SCALAR_TYPES(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(at::Half,Half,d) \ +_(float,Float,d) \ +_(double,Double,d) + +#define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(float,Float,d) \ +_(double,Double,d) + +enum class ScalarType { +#define DEFINE_ENUM(_1,n,_2) \ + n, + AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) +#undef DEFINE_ENUM + Undefined, + NumOptions +}; + +enum class Backend { + CPU, + CUDA, + SparseCPU, + SparseCUDA, + Undefined, + NumOptions +}; + +constexpr Backend kCPU = Backend::CPU; +constexpr Backend kCUDA = Backend::CUDA; +constexpr Backend kSparseCPU = Backend::SparseCPU; +constexpr Backend kSparseCUDA = Backend::SparseCUDA; + +static inline Backend toSparse(Backend b) { + switch (b) { + case Backend::CPU: return Backend::SparseCPU; + case Backend::CUDA: return Backend::SparseCUDA; + case Backend::SparseCPU: return Backend::SparseCPU; + case Backend::SparseCUDA: return Backend::SparseCUDA; + default: throw std::runtime_error("Unknown backend"); + } +} + +static inline Backend toDense(Backend b) { + switch (b) { + case Backend::CPU: return Backend::CPU; + case Backend::CUDA: return Backend::CUDA; + case Backend::SparseCPU: return Backend::CPU; + case Backend::SparseCUDA: return Backend::CUDA; + default: throw std::runtime_error("Unknown backend"); + } +} + +static inline const char * toString(Backend b) { + switch(b) { + case Backend::CPU: return "CPU"; + case Backend::CUDA: return "CUDA"; + case Backend::SparseCPU: return "SparseCPU"; + case Backend::SparseCUDA: return "SparseCUDA"; + default: return "UNKNOWN_BACKEND"; + } +} + +#define DEFINE_CONSTANT(_,name,_2) \ +constexpr ScalarType k##name = ScalarType::name; + +AT_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) +#undef DEFINE_CONSTANT + +static inline const char * toString(ScalarType t) { +#define DEFINE_CASE(_,name,_2) \ + case ScalarType:: name : return #name; + + switch(t) { + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + default: + return "UNKNOWN_SCALAR"; + } +#undef DEFINE_CASE +} + +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ + case ScalarType:: name : return sizeof(ctype); + + switch(t) { + AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) + default: + AT_ERROR("Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + +static inline bool isIntegralType(ScalarType t) { + return (t == ScalarType::Byte || + t == ScalarType::Char || + t == ScalarType::Int || + t == ScalarType::Long || + t == ScalarType::Short); +} + +static inline bool isFloatingType(ScalarType t) { + return (t == ScalarType::Double || + t == ScalarType::Float || + t == ScalarType::Half); +} + +static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { + // This is generated according to NumPy's promote_types +#define u1 ScalarType::Byte +#define i1 ScalarType::Char +#define i2 ScalarType::Short +#define i4 ScalarType::Int +#define i8 ScalarType::Long +#define f2 ScalarType::Half +#define f4 ScalarType::Float +#define f8 ScalarType::Double +#define ud ScalarType::Undefined + static constexpr ScalarType _promoteTypesLookup + [static_cast(ScalarType::NumOptions)] + [static_cast(ScalarType::NumOptions)] = { + /* u1 i1 i2 i4 i8 f2 f4 f8, ud */ + /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8, ud }, + /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8, ud }, + /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8, ud }, + /* i4 */ { i4, i4, i4, i4, i8, f8, f4, 
f8, ud }, + /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8, ud }, + /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8, ud }, + /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8, ud }, + /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8, ud }, + /* ud */ { ud, ud, ud, ud, ud, ud, ud, ud, ud }, + }; +#undef u1 +#undef i1 +#undef i2 +#undef i4 +#undef i8 +#undef f2 +#undef f4 +#undef f8 +#undef ud + return _promoteTypesLookup[static_cast(a)][static_cast(b)]; +} + +struct Tensor; +typedef ArrayRef IntList; +typedef ArrayRef TensorList; + +} // namespace at diff --git a/aten/src/ATen/ScalarTypeUtils.h b/aten/src/ATen/ScalarTypeUtils.h new file mode 100644 index 0000000..ff96bbe --- /dev/null +++ b/aten/src/ATen/ScalarTypeUtils.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ATen/ScalarType.h" + +namespace at { + +template +struct CTypeToScalarType { +}; + +#define DEFINE_TO_SCALAR_TYPE(ct, st, _2) \ +template <> \ +struct CTypeToScalarType { \ + static inline at::ScalarType to() { return at::ScalarType::st; } \ +}; +AT_FORALL_SCALAR_TYPES(DEFINE_TO_SCALAR_TYPE) +#undef DEFINE_TO_SCALAR_TYPE + +} // namespace at diff --git a/aten/src/ATen/SmallVector.cpp b/aten/src/ATen/SmallVector.cpp new file mode 100644 index 0000000..59095a2 --- /dev/null +++ b/aten/src/ATen/SmallVector.cpp @@ -0,0 +1,50 @@ +//===- llvm/ADT/SmallVector.cpp - 'Normally small' vectors ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc + +#include "SmallVector.h" + +namespace at { + +/// grow_pod - This is an implementation of the grow() method which only works +/// on POD-like datatypes and is out of line to reduce code duplication. +void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, + size_t TSize) { + size_t CurSizeBytes = size_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. + if (NewCapacityInBytes < MinSizeInBytes) + NewCapacityInBytes = MinSizeInBytes; + + void *NewElts; + if (BeginX == FirstEl) { + NewElts = malloc(NewCapacityInBytes); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + if (NewElts == nullptr) + throw std::bad_alloc(); + } + + this->EndX = (char*)NewElts+CurSizeBytes; + this->BeginX = NewElts; + this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; +} + +} diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h new file mode 100644 index 0000000..238a181 --- /dev/null +++ b/aten/src/ATen/SmallVector.h @@ -0,0 +1,976 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. 
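Before continuing with SmallVector, one note on the promotion table above: the result is read as table[a][b], the table as written is symmetric, and the comment says it was generated from NumPy's promote_types. A hedged sanity check one could compile against this table (ATen build and include path assumed):

```cpp
// Hedged sanity check reading entries straight off the promotion table above.
#include <ATen/ScalarType.h>
#include <cassert>

int main() {
  assert(at::promoteTypes(at::kByte, at::kChar)   == at::kShort);   // u1 x i1 -> i2
  assert(at::promoteTypes(at::kInt,  at::kDouble) == at::kDouble);  // i4 x f8 -> f8
  assert(at::promoteTypes(at::kLong, at::kFloat)  == at::kFloat);   // i8 x f4 -> f4, per this table
  return 0;
}
```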
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + +#pragma once + +#include "AlignOf.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_API SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); + +public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { +private: + template friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + +protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + +public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T *; + using const_iterator = const T *; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T &; + using const_reference = const T &; + using pointer = T *; + using const_pointer = const T *; + + // forward iterator creation methods. 
+ iterator begin() { return (iterator)this->BeginX; } + const_iterator begin() const { return (const_iterator)this->BeginX; } + iterator end() { return (iterator)this->EndX; } + const_iterator end() const { return (const_iterator)this->EndX; } + +protected: + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + +public: + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + size_type size() const { return end()-begin(); } + size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { return const_pointer(begin()); } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. 
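The out-of-line grow() definition follows; for non-POD elements it picks the next power of two past the current capacity, while the POD path in grow_pod() (SmallVector.cpp above) roughly doubles the byte capacity. Either way, elements stay in the inline buffer until capacity is first exceeded. A hedged usage sketch of that small-buffer behaviour (ATen build and include path assumed):

```cpp
// Sketch of the small-buffer behaviour implemented by grow()/grow_pod():
// elements live inline until capacity is exceeded, then move to the heap.
#include <ATen/SmallVector.h>   // include path assumed
#include <iostream>

int main() {
  at::SmallVector<int, 4> v;          // inline space for 4 ints
  for (int i = 0; i < 4; ++i) v.push_back(i);
  const int* inline_buf = v.data();   // still points into the object itself

  v.push_back(4);                     // exceeds the inline capacity, triggers growth
  std::cout << (v.data() == inline_buf ? "still inline" : "spilled to heap") << "\n";
  std::cout << v.size() << " " << v.capacity() << "\n";  // 5 and >= 5
}
```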
+template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + +public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N*sizeof(T)) { + } + +public: + SmallVectorImpl(const SmallVectorImpl &) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. 
+ this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. 
+ iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. 
+ T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. 
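The set_size() member defined next enables exactly the pattern the comment above describes: reserve capacity, let a producer write into the raw buffer past end(), then publish the new length without value-initializing elements first. A hedged sketch, only safe for trivially constructible element types since set_size() runs no constructors or destructors; produce_ints is a hypothetical producer:

```cpp
// Sketch of the reserve() + data() + set_size() pattern described above.
#include <ATen/SmallVector.h>   // include path assumed
#include <cstddef>

// Hypothetical external producer that fills a raw buffer and reports how much it wrote.
static size_t produce_ints(int* dst, size_t max_n) {
  size_t n = max_n < 3 ? max_n : 3;
  for (size_t i = 0; i < n; ++i) dst[i] = static_cast<int>(i * 10);
  return n;
}

at::SmallVector<int, 8> read_values() {
  at::SmallVector<int, 8> out;
  out.reserve(8);                                  // guarantee capacity up front
  size_t written = produce_ints(out.data(), 8);    // write past end(), within capacity
  out.set_size(written);                           // publish the size afterwards
  return out;
}
```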
+ void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. 
+ iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template struct SmallVectorStorage {}; +template struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + +public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container &&c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVector &LHS, at::SmallVector &RHS) { + LHS.swap(RHS); + } + +} // end namespace std diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp new file mode 100644 index 0000000..62c8356 --- /dev/null +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -0,0 +1,87 @@ +#include +#include + +namespace at { + + +// An empty dense tensor defaults to a 1-dimensional tensor of size [0] +// (recall, it is not a 0-dimensional tensor, because such a tensor would +// a scalar and have one element) +// +// Thus, an empty sparse tensor should be a 1-dimensional tensor of size [0]. +// Furthermore, we have dim == sparseDims + denseDims; since this is a sparse +// tensor, let us say that an empty sparse tensor has sparseDims == 1 and +// denseDims == 0. (There is a degree of freedom here, but given that this +// is a sparse dimension, it seems reasonable to demand that sparseDims > 0). +// +// In an ideal world, this would then mean we allocate a [1,0] size indices +// tensor and a [0] size values tensor for such an empty tensor. However, +// we don't currently support zero-size dimensions, so we can't actually +// do this; so we just allocate zero-size tensors for everything. 
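Concretely, the constructor that follows leaves a fresh sparse tensor with size [0], sparseDims == 1, denseDims == 0, nnz == 0, and zero-element indices_ and values_ tensors, rather than the ideal (1, 0) indices / (0,) values shapes the comment describes. A small bookkeeping sketch of that convention (plain C++, no ATen calls; values chosen to match the comment above):

```cpp
// Shape bookkeeping for the empty-sparse-tensor convention described above.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> size = {0};     // 1-dimensional tensor of size [0]
  int64_t sparseDims = 1;
  int64_t denseDims  = 0;
  int64_t nnz        = 0;

  // dim() == sparseDims + denseDims must match the length of the size vector.
  assert(sparseDims + denseDims == static_cast<int64_t>(size.size()));

  // Ideal shapes (if zero-size dimensions were supported):
  //   indices: (sparseDims, nnz) = (1, 0)
  //   values : (nnz, ...dense)   = (0,)
  // Actual allocation here: both indices_ and values_ are zero-element tensors.
  assert(nnz == 0);
  return 0;
}
```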
+SparseTensorImpl::SparseTensorImpl(Type * type) + : TensorImpl(type) + , size_{0} + , sparseDims_(1) + , denseDims_(0) + , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) + , values_(type->toDense().tensor()) { + AT_ASSERT(type->is_sparse()); + } + +const char * SparseTensorImpl::toString() const { + // TODO: also give back type information + return "SparseTensor"; +} +IntList SparseTensorImpl::sizes() const { + return size_; +} +IntList SparseTensorImpl::strides() const { + AT_ERROR("sparse tensors do not have strides"); +} +int64_t SparseTensorImpl::dim() const { + return sparseDims_ + denseDims_; +} +Scalar SparseTensorImpl::localScalar() { + int64_t n = numel(); + AT_CHECK(n == 1, "a Tensor with ", n, " elements cannot be converted to Scalar"); + if (nnz_ == 0) return Scalar(0); + if (coalesced_) return values_.pImpl->localScalar(); + // You have a non-coalesced scalar sparse tensor?! Wow! Have + // a cookie. + return values_.sum().pImpl->localScalar(); +} +void * SparseTensorImpl::unsafeGetTH(bool retain) { + AT_ERROR("unsafeGetTH not supported for new style TensorImpl"); +} +std::unique_ptr SparseTensorImpl::storage() { + AT_ERROR("sparse tensors do not have storage"); +} + +void SparseTensorImpl::set_indices_and_values(const Tensor& indices, const Tensor& values) { + // TODO: Explicit empty test is needed because we don't handle size zero + // dimensions at the moment + bool empty = values.numel() == 0; + AT_CHECK(values.type().toSparse() == type(), "values type must match sparse tensor type"); + AT_CHECK(indices.type().scalarType() == kLong, "indices must be an int64 tensor"); + AT_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); + AT_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); + if (!empty) { + AT_CHECK(indices.dim() == 2, "indices must be nDim x nnz"); + AT_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz"); + AT_CHECK(indices.size(0) == sparseDims_, "indices has incorrect first dimension, expected ", sparseDims_, ", got ", indices.size(0)); + AT_CHECK(values.dim() == denseDims_ + 1, "values has incorrect number of dimensions, expected ", denseDims_ + 1, ", got ", values.dim()); + } else { + AT_CHECK(indices.numel() == 0, "if values is empty, indices must be empty too"); + } + indices_ = indices; + values_ = values; + // TODO: Eliminate this ternary when we handle size zero dimensions. + // (Actually, this will "accidentally" work today because all zero-size + // tensors have size [0], and so you'll get 0 when empty is zero; but it's + // more explicit this way.) + nnz_ = empty ? 0 : values.size(0); + coalesced_ = false; +} + + +} // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h new file mode 100644 index 0000000..2093b45 --- /dev/null +++ b/aten/src/ATen/SparseTensorImpl.h @@ -0,0 +1,105 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/TensorImpl.h" +#include "ATen/Error.h" + +namespace at { +struct AT_API SparseTensorImpl : public TensorImpl { + // Stored in COO format, indices + values. 
+ + // Ideal INVARIANTS: + // _sparseDims: range [0, len(shape)]; _sparseDims + _denseDims = len(shape) + // _denseDims : range [0, len(shape)]; _sparseDims + _denseDims = len(shape) + // _indices.shape: dimensionality: 2, shape: (_sparseDims, nnz) + // _values.shape: dimensionality: 1 + _denseDims. shape: (nnz, shape[_sparseDims:]) + + // Actual INVARIANT differences: + // 1) _sparseDims: range [1, len(shape)] (i.e. we don't allow 0 sparse dimensions) + // 2) when nnz = 0, there is strange behavior because we lack 0-dimensional sparse tensors. Namely: + // dimensionality == 0, _sparseDims == 0, _denseDims == 0, _indices.shape == {0}, _values.shape == {0} + // 3) For both _indices.shape and _values.shape, the nnz dimension may be larger than nnz + // 4) For _values.shape, the non-nnz dimensions may be smaller than the corresponding dimension size, e.g. + // a shape (2,3) sparse tensor with _sparseDims == 1, may have _values.shape: (nnz, <=2, <=3). + + + // The true size of the sparse tensor (e.g., if you called to_dense() + // on it). When THTensor merges into TensorImpl, this field + // should move to the parent class. + std::vector size_; + + // The number of non-zero elements. + int64_t nnz_ = 0; + + int64_t sparseDims_ = 0; // number of sparse dimensions + int64_t denseDims_ = 0; // number of dense dimensions + + Tensor indices_; // always a LongTensor + Tensor values_; + + // A sparse tensor is 'coalesced' if every index occurs at most once in + // the indices tensor, and the indices are in sorted order. (This means + // that it is very easy to convert a coalesced tensor to CSR format: you + // need only compute CSR format indices.) + // + // Most math operations can only be performed on coalesced sparse tensors, + // because many algorithms proceed by merging two sorted lists (of indices). + bool coalesced_ = false; + +public: + // Public for now... + explicit SparseTensorImpl(Type * type); + + int64_t nnz() const { return nnz_; } + int64_t sparseDims() const { return sparseDims_; } + int64_t denseDims() const { return denseDims_; } + bool coalesced() const { return coalesced_; } + Tensor indices() const { return indices_; } + Tensor values() const { return values_; } + + const char * toString() const override; + IntList sizes() const override; + IntList strides() const override; + int64_t dim() const override; + Scalar localScalar() override; + void * unsafeGetTH(bool retain) override; + std::unique_ptr storage() override; + + // Some ops do some manual size fiddling. + // TODO: Figure out a more safe way to provide this functionality + std::vector& _sizes_mut() { return size_; } + + // WARNING: This function does NOT preserve invariants of sparseDims/denseDims with + // respect to indices and values + void raw_resize_(int64_t sparseDims, int64_t denseDims, ArrayRef size) { + // UGHHHHH. Legacy special case + if (size.size() == 0) { + size_ = {0}; + } else { + size_ = size; + } + sparseDims_ = sparseDims; + denseDims_ = denseDims; + } + + // TODO: I hate these two setters, please get rid of them!!! 
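The "coalesced" invariant described above is easiest to see with a single sparse dimension: sort the entries by index and merge duplicates by summing their values. The sketch below shows that operation with plain containers; it illustrates the invariant only and is not the ATen coalescing kernel.

```
// Coalescing for the 1-sparse-dim case: sort by index, merge duplicates.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

void coalesce(std::vector<std::int64_t>& idx, std::vector<double>& val) {
  std::vector<std::pair<std::int64_t, double>> entries;
  for (std::size_t i = 0; i < idx.size(); ++i) {
    entries.emplace_back(idx[i], val[i]);
  }
  std::sort(entries.begin(), entries.end());
  idx.clear();
  val.clear();
  for (const auto& e : entries) {
    if (!idx.empty() && idx.back() == e.first) {
      val.back() += e.second;  // duplicate index: accumulate
    } else {
      idx.push_back(e.first);
      val.push_back(e.second);
    }
  }
}

int main() {
  std::vector<std::int64_t> idx = {3, 1, 3};
  std::vector<double> val = {1.0, 2.0, 4.0};
  coalesce(idx, val);  // idx = {1, 3}, val = {2.0, 5.0}
  for (std::size_t i = 0; i < idx.size(); ++i) {
    std::cout << idx[i] << " -> " << val[i] << "\n";
  }
  return 0;
}
```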
+ void set_indices(const Tensor& indices) { + AT_ASSERT(indices.type().backend() == at::toDense(type().backend())); + AT_ASSERT(indices.type().scalarType() == kLong); + indices_ = indices; + } + void set_values(const Tensor& values) { + AT_ASSERT(values.type().toSparse() == type()); + values_ = values; + } + + void set_coalesced(bool coalesced) { coalesced_ = coalesced; } + void set_nnz(int64_t nnz) { nnz_ = nnz; } + + // This used to be called THSTensor_(_move) + // NB: This used to be able to avoid a refcount bump, but I was too lazy to + // make it happen + void set_indices_and_values(const Tensor& indices, const Tensor& values); +}; + +} // namespace at diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/SparseTensorRef.h new file mode 100644 index 0000000..9c9fada --- /dev/null +++ b/aten/src/ATen/SparseTensorRef.h @@ -0,0 +1,11 @@ +#pragma once + +namespace at { + +struct Tensor; +struct SparseTensorRef { + explicit SparseTensorRef(const Tensor& t): tref(t) {} + const Tensor& tref; +}; + +} diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h new file mode 100644 index 0000000..582a82a --- /dev/null +++ b/aten/src/ATen/Storage.h @@ -0,0 +1,41 @@ +#pragma once + +#include "ATen/Scalar.h" + +namespace at { + +struct Type; + +struct Storage { + static const char RESIZABLE = 2; + + Storage() {} + Storage(const Storage& other) = delete; + void operator=(const Storage&) = delete; + + virtual ~Storage() {}; + virtual size_t elementSize() const = 0; + virtual size_t size() const = 0; + virtual void* data() = 0; + virtual const void* data() const = 0; + virtual Storage& retain() = 0; + virtual Storage& free() = 0; + virtual void * unsafeGetTH(bool retain) const = 0; + + virtual Storage& resize(int64_t new_size) = 0; + + virtual Type & type() const = 0; + virtual int getDevice() const = 0; + virtual const char * toString() const = 0; + + virtual Storage& fill(Scalar value) = 0; + virtual Storage& set(size_t ind, Scalar value) = 0; + virtual Storage& fast_set(size_t ind, Scalar value) = 0; + virtual Scalar get(size_t ind) = 0; + virtual Scalar fast_get(size_t ind) = 0; + + virtual void set_flag(char flag) = 0; + virtual void clear_flag(char flag) = 0; +}; + +} // namespace at diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h new file mode 100644 index 0000000..11c6ca8 --- /dev/null +++ b/aten/src/ATen/THLongStorageView.h @@ -0,0 +1,77 @@ +#pragma once + +#include "TH/TH.h" +#include "TH/THStorage.hpp" +#include "TH/THTypeConversion.hpp" + +namespace at { + +enum class THLongStorageViewKind { + SIZE, + STRIDE, + LENGTH, +}; + +// make a fake storage out of a size, pointer pair... +// used as an argument where THSize and THStride are passed into TH +class THLongStorageView { +public: + operator THLongStorage*() { + if (storage.size == 0 && zero_dim_to_null) { + return nullptr; + } + return &storage; + } + + /* + // This is done as an enum, and not as static constructors, as there + // is no move/copy constructor for THLongStorageView + + static THLongStorageView makeFromSize(ArrayRef ref) { + ... + } + + static THLongStorageView makeFromLength(ArrayRef ref) { + ... 
+ } + */ + + THLongStorageView(ArrayRef ref, THLongStorageViewKind kind) + : zero_dim_to_null(false) + { + // zero_dim_to_one converts an empty ArrayRef into [1] + // zero_dim_to_null converts an empty ArrayRef into a null THLongStorage + bool zero_dim_to_one = false; + bool noelem_to_empty = false; + switch (kind) { + case THLongStorageViewKind::SIZE: + zero_dim_to_one = true; + break; + case THLongStorageViewKind::STRIDE: + zero_dim_to_null = true; + break; + case THLongStorageViewKind::LENGTH: + break; + } + + if(zero_dim_to_one && ref.size() == 0) { + // make storage of size 0 actually a 1-length storage with 1 element + // so that our 0-dim tensors get allocated as 1-dim inside TH + one = 1; + storage.data_ptr = {&one, kCPU}; // non-owning + storage.size = 1; + } else { + storage.data_ptr = {const_cast(static_cast(ref.data())), kCPU}; // non-owning + storage.size = ref.size(); + } + storage.scalar_type = at::CTypeToScalarType>::to(); + storage.refcount = 0; + storage.flag = 0; + } +private: + int64_t one; + THLongStorage storage; + bool zero_dim_to_null; +}; + +} diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/Tensor.cpp new file mode 100644 index 0000000..88ecdab --- /dev/null +++ b/aten/src/ATen/Tensor.cpp @@ -0,0 +1,14 @@ +#include + +#include + +namespace at { + +void Tensor::print() const { + if (defined()) { + std::cerr << "[" << type().toString() << " " << sizes() << "]" << std::endl; + } else { + std::cerr << "[UndefinedTensor]" << std::endl; + } +} +} // namespace at diff --git a/aten/src/ATen/TensorAccessor.h b/aten/src/ATen/TensorAccessor.h new file mode 100644 index 0000000..e51af27 --- /dev/null +++ b/aten/src/ATen/TensorAccessor.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +#include "ATen/ScalarType.h" + +namespace at { + + +template +class TensorAccessorBase { +public: + TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_), sizes_(sizes_), strides_(strides_) {} + IntList sizes() { + return IntList(sizes_,N); + } + IntList strides() { + return IntList(strides_,N); + } + int64_t stride(int64_t i) { return strides()[i]; } + int64_t size(int64_t i) { return sizes()[i]; } +protected: + T * data_; + const int64_t* sizes_; + const int64_t* strides_; +}; + +template +class TensorAccessor : public TensorAccessorBase { +public: + TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + + TensorAccessor operator[](int64_t i) { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } +}; + +template +class TensorAccessor : public TensorAccessorBase { +public: + TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + +} diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h new file mode 100644 index 0000000..3aea68f --- /dev/null +++ b/aten/src/ATen/TensorBase.h @@ -0,0 +1,108 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include "ATen/UndefinedTensor.h" + +namespace at { namespace detail { + +// TensorBaseImpl is the base class for Tensor which handles the reference counting +template +struct TensorBaseImpl { + TensorBaseImpl(): TensorBaseImpl(UndefinedTensor::singleton(), false) {} + TensorBaseImpl(TensorImpl * self, bool should_retain) + : pImpl(self) { + if (pImpl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not 
supported"); + } + if(should_retain && pImpl != UndefinedTensor::singleton()) { + retain(); + } + } + TensorBaseImpl(const TensorBaseImpl & rhs) + : pImpl(rhs.pImpl) { + if (pImpl != UndefinedTensor::singleton()) { + retain(); + } + } + TensorBaseImpl(TensorBaseImpl && rhs) noexcept + : pImpl(rhs.pImpl) { + rhs.pImpl = UndefinedTensor::singleton(); + } + ~TensorBaseImpl() { + if (pImpl != UndefinedTensor::singleton()) { + release(); + } + } + TensorBaseImpl & operator=(TensorBaseImpl && rhs) & { + rhs.swap(*this); + return *this; + } + TensorBaseImpl & operator=(TensorBaseImpl const & rhs) & { + //TensorBaseImpl ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally TensorBaseImpl dtor releases rhs.pImpl, which was originally this->pImpl + TensorBaseImpl(rhs).swap(*this); + return *this; + } + int64_t dim() const { + if (is_strong) { + return pImpl->dim(); + } else { + AT_ERROR("Can't call dim() on a WeakTensor"); + } + } + void reset() { + TensorBaseImpl().swap(*this); + } + void reset(TensorImpl * rhs) { + TensorBaseImpl(rhs, true).swap(*this); + } + void reset(TensorImpl * rhs, bool should_retain) { + TensorBaseImpl(rhs, should_retain).swap(*this ); + } + void swap(TensorBaseImpl & rhs) { + TensorImpl * tmp = pImpl; + pImpl = rhs.pImpl; + rhs.pImpl = tmp; + } + TensorImpl * get() const { + return pImpl; + } + TensorImpl * detach() { + TensorImpl * ret = pImpl; + pImpl = UndefinedTensor::singleton(); + return ret; + } + + bool defined() const { + return pImpl != UndefinedTensor::singleton(); + } + + friend struct Type; + + //TODO(zach): sort out friend structes +public: + TensorImpl * pImpl; + +private: + void retain() { + if (is_strong) { + pImpl->retain(); + } else { + pImpl->weak_retain(); + } + } + + void release() { + if (is_strong) { + pImpl->release(); + } else { + pImpl->weak_release(); + } + } +}; + +using TensorBase = TensorBaseImpl; +using WeakTensorBase = TensorBaseImpl; + +}} // namespace at::detail diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp new file mode 100644 index 0000000..98d47ec --- /dev/null +++ b/aten/src/ATen/TensorGeometry.cpp @@ -0,0 +1,23 @@ +#include + +#include + +namespace at { + +bool TensorGeometry::is_contiguous() const { + int64_t dim = sizes_.size(); + int64_t expected_stride = 1; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes_[i] != 1 && strides_[i] != expected_stride) { + return false; + } + expected_stride *= sizes_[i]; + } + return true; +} + +Tensor TensorGeometry::zeros_with_stride(const Type& type) const { + return type.tensor(sizes_, strides_).zero_(); +} + +} // namespace at diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h new file mode 100644 index 0000000..60f6098 --- /dev/null +++ b/aten/src/ATen/TensorGeometry.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include + +namespace at { + +struct AT_API TensorGeometry { + TensorGeometry() : storage_offset_(0) {} + + explicit TensorGeometry(IntList sizes) + : sizes_(sizes) + , strides_(sizes.size()) + , storage_offset_(0) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + for (int64_t i = dim - 1; i >= 0; i--) { + strides_[i] = expected_stride; + expected_stride *= sizes_[i]; + } + } + + explicit TensorGeometry(const Tensor& t) + : sizes_(t.sizes()) + , strides_(t.strides()) + , storage_offset_(t.storage_offset()) {} + + // true if the tensor is contiguous + bool is_contiguous() const; + + // creates a new tensor with the sizes and strides of the source + Tensor 
zeros_with_stride(const Type& type) const; + + int64_t dim() const { return sizes_.size(); } + int64_t size(int64_t dim) const { + dim = maybe_wrap_dim(dim, this->dim()); + return sizes_.at(static_cast(dim)); + } + IntList sizes() const { return IntList{ sizes_ }; } + int64_t stride(int64_t dim) const { + dim = maybe_wrap_dim(dim, this->dim()); + return strides_.at(static_cast(dim)); + } + IntList strides() const { return IntList{ strides_ }; } + int64_t storage_offset() const { return storage_offset_; } + int64_t numel() const { + int64_t r = 1; + for (auto s : sizes()) { + r *= s; + } + return r; + } + + TensorGeometry transpose(int64_t dim0, int64_t dim1) { + TensorGeometry r = *this; // copy + AT_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") + AT_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") + std::swap(r.sizes_[dim0], r.sizes_[dim1]); + std::swap(r.strides_[dim0], r.strides_[dim1]); + return r; + } + + std::vector sizes_; + std::vector strides_; + int64_t storage_offset_; +}; + +} // namespace at diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp new file mode 100644 index 0000000..a77664d --- /dev/null +++ b/aten/src/ATen/TensorImpl.cpp @@ -0,0 +1,36 @@ +#include + +#include +#include + +namespace at { +Tensor& TensorImpl::grad() { + AT_ERROR("grad is not implemented for Tensor"); +} + +const Tensor& TensorImpl::grad() const { + AT_ERROR("grad is not implemented for Tensor"); +} + +Tensor TensorImpl::detach() const { + AT_ERROR("detach is not implemented for Tensor"); +} + +void TensorImpl::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + AT_ERROR("backward is not implemented for Tensor"); +} + +void TensorImpl::set_data(Tensor new_data) { + AT_ERROR("set_type is not implemented for Tensor"); +} + +void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + pImpl->backward(std::move(gradient), keep_graph, create_graph); +} +} // namespace at diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h new file mode 100644 index 0000000..f5abf15 --- /dev/null +++ b/aten/src/ATen/TensorImpl.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include + +#include "ATen/Retainable.h" +#include "ATen/ScalarType.h" +#include "ATen/optional.h" + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct TensorImpl : public Retainable { + explicit TensorImpl(Type * type) + : is_scalar(false), type_(type) {} + + Type & type() const { + return *type_; + } + virtual const char * toString() const = 0; + virtual IntList sizes() const = 0; + virtual IntList strides() const = 0; + virtual int64_t dim() const = 0; + /** + * Perform a conversion of this tensor to a scalar, if numel() == 1. + * Otherwise, raise an error. + */ + virtual Scalar localScalar() = 0; + virtual void * unsafeGetTH(bool retain) = 0; + virtual std::unique_ptr storage() = 0; + friend struct Type; + + int64_t numel() { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + + // 0-dim patchup of TH requires us to have a flag marking + // if a Tensor should be treated as 0-dim. + // the generated wrapper manipulates this flag. 
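The TensorGeometry(IntList) constructor and is_contiguous() shown above share one convention: row-major (C-contiguous) strides are built by scanning the sizes from the last dimension backwards. Here is a standalone sketch of both, on plain vectors rather than ATen types.

```
// Row-major stride construction and the matching contiguity check.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::int64_t> contiguous_strides(const std::vector<std::int64_t>& sizes) {
  std::vector<std::int64_t> strides(sizes.size());
  std::int64_t expected = 1;
  for (std::int64_t i = static_cast<std::int64_t>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = expected;
    expected *= sizes[i];
  }
  return strides;
}

bool is_contiguous(const std::vector<std::int64_t>& sizes,
                   const std::vector<std::int64_t>& strides) {
  std::int64_t expected = 1;
  for (std::int64_t i = static_cast<std::int64_t>(sizes.size()) - 1; i >= 0; --i) {
    if (sizes[i] != 1 && strides[i] != expected) return false;
    expected *= sizes[i];
  }
  return true;
}

int main() {
  std::vector<std::int64_t> sizes = {2, 3, 4};
  auto strides = contiguous_strides(sizes);            // {12, 4, 1}
  std::cout << is_contiguous(sizes, strides) << "\n";  // 1
  std::swap(strides[0], strides[1]);                   // a "transposed" layout
  std::cout << is_contiguous(sizes, strides) << "\n";  // 0
  return 0;
}
```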
+ // the setter should never be exposed in Tensor's public API + // because eventually we would like isScalar() to just be dim() == 0; + bool isScalar() const { + return is_scalar; + } + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be a scalar. e.g. when all inputs + // to a function 'add' were scalars, then condition_when_scalar == true. + // we also prevent this from getting marked as a scalar if it is not + // the right shape afterall. + TensorImpl* maybeScalar(bool condition_when_scalar) { + is_scalar = false; //force dim() to tell the truth for TH + is_scalar = condition_when_scalar && dim() == 1 && sizes()[0] == 1; + return this; + } + void setScalar(bool s) { + is_scalar = s; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + AT_API virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + AT_API virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + AT_API virtual Tensor& grad(); + AT_API virtual const Tensor& grad() const; + + AT_API virtual Tensor detach() const; + AT_API virtual void detach_() { + AT_ERROR("detach_ is not implemented for Tensor"); + } + + AT_API virtual void backward( + at::optional gradient, + bool keep_graph, + bool create_graph); + + AT_API virtual void set_data(Tensor new_data); + +protected: + bool is_scalar; + Type * type_; +}; +} // namespace at diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h new file mode 100644 index 0000000..a1b191f --- /dev/null +++ b/aten/src/ATen/TensorOperators.h @@ -0,0 +1,93 @@ +#pragma once + +#include "ATen/Scalar.h" +#include "ATen/Tensor.h" +#include "ATen/Type.h" + +#include +#include + +namespace at { + + +inline Tensor & Tensor::operator=(Scalar v) && { + return fill_(v); +} +inline Tensor Tensor::operator-() const { + return neg(); +} +inline Tensor& Tensor::operator+=(const Tensor & other) { + return add_(other); +} +inline Tensor& Tensor::operator+=(Scalar other) { + return add_(other); +} +inline Tensor& Tensor::operator-=(const Tensor & other) { + return sub_(other); +} +inline Tensor& Tensor::operator-=(Scalar other) { + return sub_(other); +} +inline Tensor& Tensor::operator*=(const Tensor & other) { + return mul_(other); +} +inline Tensor& Tensor::operator*=(Scalar other) { + return mul_(other); +} +inline Tensor& Tensor::operator/=(const Tensor & other) { + return div_(other); +} +inline Tensor& Tensor::operator/=(Scalar other) { + return div_(other); +} +inline Tensor Tensor::operator[](Scalar index) const { + AT_CHECK( + index.local().isIntegral(), + "Can only index tensors with integral scalars (got ", + index.toTensor().type().toString(), ")"); + return select(0, index.toLong()); +} +inline Tensor Tensor::operator[](Tensor index) const { + // These properties are checked in the Scalar constructor, but we already + // check them here to provide more useful diagnostics for the user. + AT_CHECK(index.defined(), "Can only index with tensors that are defined"); + AT_CHECK( + index.dim() == 0, + "Can only index with tensors that are scalars (zero-dim)"); + // The Scalar(Tensor) constructor is explicit, so we need to call it. 
+ return this->operator[](Scalar(index)); +} +inline Tensor Tensor::operator[](int64_t index) const { + return select(0, index); +} + +#define AT_FORALL_BINARY_OPS(_) \ +_(+,x.add(y), y.add(x)) \ +_(*,x.mul(y), y.mul(x)) \ +_(-,x.sub(y), y.type().tensor().resize_(y.sizes()).fill_(x).sub_(y)) \ +_(/,x.div(y), y.type().tensor().resize_(y.sizes()).fill_(x).div_(y)) \ +_(%,x.remainder(y), y.type().tensor().resize_(y.sizes()).fill_(x).remainder_(y)) \ +_(<,x.lt(y), y.gt(x)) \ +_(<=,x.le(y), y.ge(x)) \ +_(>,x.gt(y),y.lt(x)) \ +_(>=,x.ge(y), y.le(x)) \ +_(==,x.eq(y), y.eq(x)) \ +_(!=,x.ne(y), y.ne(x)) + +#define DEFINE_OPERATOR(op,body,reverse_scalar_body) \ +static inline Tensor operator op(const Tensor & x, const Tensor & y) { \ + return body; \ +} \ +static inline Tensor operator op(const Tensor & x, Scalar y) { \ + return body; \ +} \ +static inline Tensor operator op(Scalar x, const Tensor & y) { \ + return reverse_scalar_body; \ +} + + +AT_FORALL_BINARY_OPS(DEFINE_OPERATOR) +#undef DEFINE_OPERATOR +#undef AT_FORALL_BINARY_OPS + +} diff --git a/aten/src/ATen/TensorOptions.cpp b/aten/src/ATen/TensorOptions.cpp new file mode 100644 index 0000000..cb8b9bf --- /dev/null +++ b/aten/src/ATen/TensorOptions.cpp @@ -0,0 +1,19 @@ +#include + +#include +#include +#include +#include +#include + +namespace at { + +TensorOptions::TensorOptions(bool use_thread_local_default_options) { + if (use_thread_local_default_options) { + this->dtype(DefaultTensorOptions::get().dtype()); + this->device(DefaultTensorOptions::get().device()); + this->layout(DefaultTensorOptions::get().layout()); + this->requires_grad(DefaultTensorOptions::get().requires_grad()); + } +} +} // namespace at diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h new file mode 100644 index 0000000..53ad9d8 --- /dev/null +++ b/aten/src/ATen/TensorOptions.h @@ -0,0 +1,279 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +/// A class to encapsulate construction axes of a `Tensor`. +/// `TensorOptions` is a virtual class to enable overriding of certain methods +/// by subclasses in other libraries, such as PyTorch. In PyTorch, there is a +/// `torch::TensorOptions` subclass of this `TensorOptions`, which changes +/// `type()` to return a variable type instead of a tensor type, such that +/// variables are created inside factory methods, instead of tensors. +struct AT_API TensorOptions { + TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} + + /// Constructs the `TensorOptions` with defaults taken from the thread local + /// `TensorOptions` object if `use_thread_local_default_options`, else + /// defaults to: + /// - dtype: kFloat, + /// - device: kCPU, + /// - layout: kStrided, + /// - requires_grad: false + explicit TensorOptions(bool use_thread_local_default_options); + + /// Constructs the `TensorOptions` from the type of the given `Tensor`. + /// If the `Tensor` has a CUDA type, the `device_index` will match that of the + /// tensor. The `requires_grad` property of the tensor is ignored and set to + /// false in the created `TensorOptions`. See the constructor from `Type` for + /// the semantics w.r.t. the `type()` method. 
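The AT_FORALL_BINARY_OPS / DEFINE_OPERATOR pair in TensorOperators.h above is an X-macro: one list of (operator, body) entries is expanded through a second macro that stamps out the actual operator functions. A stripped-down sketch of the same trick on a toy type (not ATen):

```
#include <iostream>

struct Num {
  double v;
  Num add(Num o) const { return {v + o.v}; }
  Num mul(Num o) const { return {v * o.v}; }
};

// One list of entries...
#define FORALL_BINARY_OPS(_) \
  _(+, x.add(y))             \
  _(*, x.mul(y))

// ...and one macro that turns each entry into a free operator.
#define DEFINE_OPERATOR(op, body) \
  inline Num operator op(const Num& x, const Num& y) { return body; }

FORALL_BINARY_OPS(DEFINE_OPERATOR)
#undef DEFINE_OPERATOR
#undef FORALL_BINARY_OPS

int main() {
  Num a{2.0}, b{3.0};
  std::cout << (a + b).v << " " << (a * b).v << "\n";  // 5 6
  return 0;
}
```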
+ explicit TensorOptions(Tensor tensor, bool discard_runtime_type = false) { + if (!discard_runtime_type) { + type_ = &tensor.type(); + } + this->dtype(tensor.dtype()); + this->device(tensor.device()); + this->layout(tensor.layout()); + } + + /// Constructs the `TensorOptions` from a type and a `device_index`. + /// + /// If `discard_runtime_type` is false (the default), the behavior of + /// `TensorOptions::type()` is changed in that it will always return this + /// `type`, irrespective of any `device` or `dtype` or `layout` specified at a + /// later time. This is to ensure that when a `TensorOptions` object is + /// constructed from a tensor's type, and that type has a dynamic type other + /// than `at::Type` (e.g. `torch::autograd::VariableType`), constructing a new + /// tensor from this `TensorOptions` will use this same derived type. If + /// instead the given `type` were destructured into its components (backend, + /// dtype and layout), information about the runtime type of the `Type` would + /// be lost. Set `discard_runtime_type` to `true` to always destructure the + /// type into its components and discard its runtime type. + /* implicit */ TensorOptions( + const Type& type, + int32_t device_index = -1, + bool discard_runtime_type = false) { + if (!discard_runtime_type) { + type_ = &type; + } + this->dtype(type.scalarType()); + this->device({type.backend(), device_index}); + this->layout(type.layout()); + } + + /// Constructs a `TensorOptions` object with the given layout. + /* implicit */ TensorOptions(Layout layout) : TensorOptions() { + this->layout(layout); + } + + /// Constructs a `TensorOptions` object with the given device. + /* implicit */ TensorOptions(Device device) : TensorOptions() { + this->device(device); + } + + /// Constructs a `TensorOptions` object from a backend, forwarded to the + /// `Device` constructor. + /* implicit */ TensorOptions(Backend backend) + : TensorOptions(Device(backend)) {} + + /// Constructs a `TensorOptions` object with the given dtype. + /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { + this->dtype(dtype); + } + + /// True if all elements of the `TensorOptions` match that of the other. + bool operator==(const TensorOptions& other) const noexcept { + return dtype_ == other.dtype_ && layout_ == other.layout_ && + device_ == other.device_ && requires_grad_ == other.requires_grad_; + } + + /// True if any of the elements of this `TensorOptions` do not match that of + /// the other. + bool operator!=(const TensorOptions& other) const noexcept { + return !(*this == other); + } + + /// Discards the runtime type stored if the `TensorOptions` was constructed + /// from a `Tensor` or a `Type`. See the documentation of the constructor from + /// a `Type` for implications on the behavior of the `type()` method on + /// `TensorOptions`. + const TensorOptions& discard_runtime_type() const { + type_ = nullptr; + return *this; + } + + /// Sets the device of the `TensorOptions`. + TensorOptions& device(Device device) { + device_ = std::move(device); + update_underlying_type(); + return *this; + } + + /// Sets the device of the `TensorOptions` to CUDA, and then sets the device + /// index to the given one. + TensorOptions& device_index(int32_t device_index) { + return device({Device::Type::CUDA, device_index}); + } + + /// Sets the dtype of the `TensorOptions`. + TensorOptions& dtype(ScalarType dtype) { + dtype_ = dtype; + update_underlying_type(); + return *this; + } + + /// Sets the layout of the `TensorOptions`. 
+ TensorOptions& layout(Layout layout) { + layout_ = layout; + update_underlying_type(); + return *this; + } + + /// Sets the `requires_grad` property of the `TensorOptions`. + TensorOptions& requires_grad(bool requires_grad) { + requires_grad_ = requires_grad; + return *this; + } + + /// Returns the device of the `TensorOptions`. + const Device& device() const noexcept { + return device_; + } + + /// Returns the device index of the `TensorOptions`. + int32_t device_index() const noexcept { + return device_.index(); + } + + /// Returns the dtype of the `TensorOptions`. + ScalarType dtype() const noexcept { + return dtype_; + } + + /// Returns the layout of the `TensorOptions`. + Layout layout() const noexcept { + return layout_; + } + + /// Returns the `requires_grad` property of the `TensorOptions`. + bool requires_grad() const noexcept { + return requires_grad_; + } + + /// Constructs an `at::Type` from the members of the `TensorOptions`. + const Type& type() const { + if (type_ != nullptr) { + return *type_; + } + return getType(backend(), dtype_); + } + + private: + /// Updates any stored underlying type to the current construction axes. + void update_underlying_type() { + if (type_) { + type_ = &type_->toScalarType(dtype_).toBackend(backend()); + } + } + + // Resolves the ATen backend specified by the current construction axes. + Backend backend() const noexcept { + Backend backend; + if (device_.type() == Device::Type::CPU) { + backend = (layout_ == kStrided) ? kCPU : kSparseCPU; + } else { + backend = (layout_ == kStrided) ? kCUDA : kSparseCUDA; + } + return backend; + } + + private: + ScalarType dtype_{kFloat}; + Device device_{Device::Type::CPU}; + Layout layout_{Layout::Strided}; + bool requires_grad_{false}; + // Not part of the observable API, so make `mutable` so we can set it to + // `null` in `discard_runtime_type`. + mutable const Type* type_{nullptr}; +}; + +/// Convenience function that returns a `TensorOptions` object with the `dtype` +/// set to the given one. +inline TensorOptions dtype(ScalarType dtype) { + return TensorOptions().dtype(dtype); +} + +/// Convenience function that returns a `TensorOptions` object with the `layout` +/// set to the given one. +inline TensorOptions layout(Layout layout) { + return TensorOptions().layout(layout); +} + +/// Convenience function that returns a `TensorOptions` object with the `device` +/// set to the given one. +inline TensorOptions device(Device device) { + return TensorOptions().device(std::move(device)); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `device_index` set to the given one. +inline TensorOptions device_index(int32_t device_index) { + return TensorOptions().device_index(device_index); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `requires_grad` set to the given one. +inline TensorOptions requires_grad(bool requires_grad = true) { + return TensorOptions().requires_grad(requires_grad); +} + +/// From Tensor.h +inline TensorOptions Tensor::options() const { + return TensorOptions(*this); +} + +namespace detail { +inline Tensor to( + const Tensor& tensor, + const TensorOptions& options, + bool non_blocking) { + // Don't copy if the options match. 
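The convenience functions above (dtype, layout, device, requires_grad) are meant to be chained as a fluent builder. The sketch below is a hypothetical usage, assuming an ATen build where this header is reachable; the header name and the chosen dtype/device values are assumptions for illustration only.

```
// Hypothetical usage of the fluent TensorOptions API above.
#include <ATen/ATen.h>

void options_example() {
  at::TensorOptions opts = at::dtype(at::kFloat)
                               .layout(at::kStrided)
                               .device(at::Device(at::Device::Type::CPU))
                               .requires_grad(false);
  // With a CPU device and strided layout, the (private) backend() helper
  // resolves to kCPU, so opts.type() yields the concrete CPU float Type
  // that factory functions can use to allocate a tensor.
  (void)opts;
}
```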
+ if (tensor.options() == options) { + return tensor; + } + DeviceGuard guard(options.device()); + return options.type().copy(tensor, non_blocking); +} +} // namespace detail + +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) + const { + if (this->device() == device && this->dtype() == dtype) { + return *this; + } + return detail::to(*this, options().device(device).dtype(dtype), non_blocking); +} + +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + if (this->dtype() == dtype) { + return *this; + } + return detail::to(*this, options().dtype(dtype), non_blocking); +} + +inline Tensor Tensor::to(Device device, bool non_blocking) const { + if (this->device() == device) { + return *this; + } + return detail::to(*this, options().device(device), non_blocking); +} +} // namespace at diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp new file mode 100644 index 0000000..2652212 --- /dev/null +++ b/aten/src/ATen/TensorUtils.cpp @@ -0,0 +1,218 @@ +#include "ATen/Config.h" +#include "ATen/TensorUtils.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace at { + +std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { + if (t.pos == 0) { + // 0 is distinguished; it usually indicates 'self' or the return + // tensor + out << "'" << t.name << "'"; + } else { + out << "argument #" << t.pos << " '" << t.name << "'"; + } + return out; +} + +void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) { + AT_CHECK(t->dim() == dim, + "Expected ", dim, "-dimensional tensor, but got ", t->dim(), + "-dimensional tensor for ", t," (while checking arguments for ", c, ")"); +} + +void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end) { + AT_CHECK( + t->dim() >= dim_start && t->dim() < dim_end, + "Expected ", dim_start, " to ", (dim_end - 1), " dimensions, but got ", + t->dim(), "-dimensional tensor for ", t, " (while checking arguments for ", + c, ")"); +} + +void checkContiguous(CheckedFrom c, const TensorGeometryArg& t) { + AT_CHECK( + t->is_contiguous(), + "Expected contiguous tensor, but got non-contiguous tensor for ", t, + " (while checking arguments for ", c, ")"); +} + +void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { + for (auto& t : ts) { + if (!t->defined()) continue; + checkContiguous(c, t); + } +} + +void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes) { + checkDim(c, t, sizes.size()); + AT_CHECK( + t->sizes().equals(sizes), + "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), + " for ", t, " (while checking arguments for ", c, ")"); +} + +void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size) { + AT_CHECK( + t->size(dim) == size, + "Expected tensor to have size ", size, " at dimension ", dim, + ", but got size ", t->size(dim), " for ", t, + " (while checking arguments for ", c, ")"); +} + +void checkAllSame(CheckedFrom c, ArrayRef tensors, void(*fn)(CheckedFrom, const TensorArg&, const TensorArg&)) { + const TensorArg* t0 = nullptr; + for (auto& t : tensors) { + if (!t->defined()) continue; + if (t0 != nullptr) { + fn(c, *t0, t); + } else { + t0 = &t; + } + } +} + +void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->sizes().equals(t2->sizes()), + "Expected tensor for ", t1, " to have same size as tensor for ", t2, + "; but ", t1->sizes(), " does not equal ", t2->sizes(), + " (while checking arguments for ", c, ")"); +} + +void 
checkAllSameSize(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameSize); +} + +void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { + AT_CHECK( + t->numel() == numel, + "Expected tensor for ", t, " to have ", numel, + " elements; but it actually has ", t->numel(), " elements", + " (while checking arguments for ", c, ")"); +} + +void checkSameNumel(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->numel() == t2->numel(), + "Expected tensor for ", t1, + " to have same number of elements as tensor for ", t2, "; but ", + t1->numel(), " does not equal ", t2->numel(), + " (while checking arguments for ", c, ")"); +} + +void checkAllSameNumel(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameNumel); +} + +void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + if (! (t1->is_cuda()) || ! (t2->is_cuda())) { + std::ostringstream oss; + if (! t1->is_cuda()) { + oss << "Tensor for " << t1 << " is on CPU, "; + } + if (! t2->is_cuda()) { + oss << "Tensor for " << t2 << " is on CPU, "; + } + oss << "but expected " << ((!(t1->is_cuda() || t2->is_cuda())) ? "them" : "it") + << " to be on GPU (while checking arguments for " << c << ")"; + AT_ERROR(oss.str()); + } + AT_CHECK( + t1->get_device() == t2->get_device(), + "Expected tensor for ", t1, " to have the same device as tensor for ", t2, + "; but device ", t1->get_device(), " does not equal ", t2->get_device(), + " (while checking arguments for ", c, ")"); +} + +void checkAllSameGPU(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameGPU); +} + +void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->type() == t2->type(), + "Expected tensor for ", t1, " to have the same type as tensor for ", t2, + "; but type ", t1->toString(), " does not equal ", t2->toString(), + " (while checking arguments for ", c, ")"); +} + +void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType ty) { + AT_CHECK( + t->type().scalarType() == ty, + "Expected tensor for ", t, " to have scalar type ", toString(ty), + "; but got ", t->toString(), " instead (while checking arguments for ", c, + ")"); +} + +void checkScalarTypes(CheckedFrom c, const TensorArg& t, + at::ArrayRef l) { + if (std::find(l.begin(), l.end(), t->type().scalarType()) == l.end()) { + std::ostringstream oss; + oss << "Expected tensor for " << t << " to have one of the following " + << "scalar types: "; + size_t i = 0; + for (auto ty : l) { + if (i != 0) { + oss << ", "; + } + oss << toString(ty); + i++; + } + oss << "; but got " << t->toString() + << " instead (while checking arguments for " << c << ")"; + AT_ERROR(oss.str()); + } +} + +void checkAllSameType(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameType); +} + +void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2) { + AT_CHECK( + t1->dim() == t2->dim(), + "Expected tensor for ", t1, " to have the same dimension as tensor for ", + t2, "; but ", t1->dim(), " does not equal ", t2->dim(), + " (while checking arguments for ", c, ")"); +} + +void checkDefined(CheckedFrom c, const TensorArg& t) { + AT_CHECK( + t->defined(), + "Expected tensor for ", t, " to be non-null, but it was undefined ", + " (while checking arguments for ", c, ")"); +} + +void checkAllDefined(CheckedFrom c, ArrayRef ts) { + // NB: don't filter defined here + for (auto t : ts) { + checkDefined(c, t); + } +} + +void checkBackend(CheckedFrom c, const 
Tensor& t, Backend backend) { + AT_CHECK( + t.type().backend() == backend, + "Expected tensor to have ", toString(backend), + " Backend, but got tensor with ", toString(t.type().backend()), " Backend ", + "(while checking arguments for ", c, ")"); +} + +void checkBackend(CheckedFrom c, ArrayRef tensors, at::Backend backend) { + for (auto &t : tensors) { + checkBackend(c, t, backend); + } +} + +void * maybe_data_ptr(const Tensor& tensor) { + return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; +} + +void * maybe_data_ptr(const TensorArg& tensor) { + return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; +} +} diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h new file mode 100644 index 0000000..cc7453f --- /dev/null +++ b/aten/src/ATen/TensorUtils.h @@ -0,0 +1,81 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/TensorGeometry.h" +#include "ATen/Utils.h" + +// These functions are NOT in Utils.h, because this file has a dep on Tensor.h + +namespace at { + +// The following are utility functions for checking that arguments +// make sense. These are particularly useful for native functions, +// which do NO argument checking by default. + +struct AT_API TensorArg { + Tensor tensor; + const char* name; + int pos; // 1-indexed + TensorArg(Tensor tensor, const char* name, int pos) + : tensor(std::move(tensor)), name(name), pos(pos) {} + const Tensor* operator->() const { return &tensor; } + const Tensor& operator*() const { return tensor; } +}; + +struct AT_API TensorGeometryArg { + TensorGeometry tensor; + const char* name; + int pos; // 1-indexed + /* implicit */ TensorGeometryArg(TensorArg arg) + : tensor(TensorGeometry{arg.tensor}), name(arg.name), pos(arg.pos) {} + TensorGeometryArg(TensorGeometry tensor, const char* name, int pos) + : tensor(tensor), name(name), pos(pos) {} + const TensorGeometry* operator->() const { return &tensor; } + const TensorGeometry& operator*() const { return tensor; } +}; + +// A string describing which function did checks on its input +// arguments. +// TODO: Consider generalizing this into a call stack. +using CheckedFrom = const char*; + +// The undefined convention: singular operators assume their arguments +// are defined, but functions which take multiple tensors will +// implicitly filter out undefined tensors (to make it easier to perform +// tests which should apply if the tensor is defined, and should not +// otherwise.) +// +// NB: This means that the n-ary operators take lists of TensorArg, +// not TensorGeometryArg, because the Tensor to TensorGeometry +// conversion will blow up if you have undefined tensors. 
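checkAllSame above folds a binary check over the argument list: every defined tensor is compared against the first defined one, and undefined tensors are skipped, as the comment describes. Here is a toy standalone version of that pattern; the Arg type, field names, and error text are made up.

```
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct Arg {
  std::string name;
  int pos;      // 1-indexed position, 0 means "self"
  long numel;   // stand-in for whatever property is being compared
  bool defined;
};

using CheckFn = void (*)(const char*, const Arg&, const Arg&);

void checkSameNumel(const char* c, const Arg& a, const Arg& b) {
  if (a.numel != b.numel) {
    throw std::runtime_error(std::string("Expected '") + a.name + "' and '" +
                             b.name + "' to have the same number of elements "
                             "(while checking arguments for " + c + ")");
  }
}

void checkAllSame(const char* c, const std::vector<Arg>& args, CheckFn fn) {
  const Arg* first = nullptr;
  for (const auto& a : args) {
    if (!a.defined) continue;  // undefined tensors are filtered out
    if (first) {
      fn(c, *first, a);
    } else {
      first = &a;
    }
  }
}

int main() {
  std::vector<Arg> args = {{"self", 0, 6, true},
                           {"other", 1, 6, true},
                           {"out", 2, 0, false}};
  checkAllSame("add_out", args, checkSameNumel);  // passes; undefined 'out' is skipped
  std::cout << "ok\n";
  return 0;
}
```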
+ +AT_API std::ostream& operator<<(std::ostream & out, TensorGeometryArg t); +AT_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim); +// NB: this is an inclusive-exclusive range +AT_API void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end); +AT_API void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); +AT_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); +AT_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); +AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes); +AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size); +AT_API void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel); +AT_API void checkSameNumel(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); +AT_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); +AT_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s); +AT_API void checkScalarTypes(CheckedFrom c, const TensorArg& t, at::ArrayRef l); +AT_API void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); +AT_API void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); +AT_API void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkDefined(CheckedFrom c, const TensorArg& t); +AT_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); + +// FixMe: does TensorArg slow things down? +AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend backend); + +// Methods for getting data_ptr if tensor is defined +AT_API void * maybe_data_ptr(const Tensor& tensor); +AT_API void * maybe_data_ptr(const TensorArg& tensor); + +} diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp new file mode 100644 index 0000000..9c9e989 --- /dev/null +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -0,0 +1,43 @@ +#include "ATen/UndefinedTensor.h" +#include "ATen/Context.h" +#include "ATen/Error.h" + +namespace at { + +// should this use the globalContext? Can it get a context passed in somehow? 
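UndefinedTensor is a null-object singleton: one statically allocated sentinel stands in for "no tensor", defined() reduces to a pointer comparison against it, and every accessor raises instead of returning garbage. A minimal sketch of the same pattern with toy classes (not the ATen types):

```
#include <iostream>
#include <stdexcept>

struct Impl {
  virtual ~Impl() = default;
  virtual long dim() const = 0;
};

struct UndefinedImpl final : Impl {
  static UndefinedImpl* singleton() { return &instance_; }
  long dim() const override {
    throw std::runtime_error("dim() called on undefined Tensor");
  }
 private:
  UndefinedImpl() = default;
  static UndefinedImpl instance_;
};
UndefinedImpl UndefinedImpl::instance_;

struct Handle {
  Impl* p = UndefinedImpl::singleton();  // default handles are "undefined"
  bool defined() const { return p != UndefinedImpl::singleton(); }
};

int main() {
  Handle h;
  std::cout << std::boolalpha << h.defined() << "\n";  // false
  try {
    h.p->dim();
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```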
+UndefinedTensor::UndefinedTensor() +: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined))) { +} + +const char * UndefinedTensor::toString() const { + return "UndefinedTensor"; +} + +IntList UndefinedTensor::sizes() const { + AT_ERROR("sizes() called on undefined Tensor"); +} + +int64_t UndefinedTensor::dim() const { + AT_ERROR("dim() called on undefined Tensor"); +} + +const char * UndefinedTensor::typeString() { + return "UndefinedType"; +} +void * UndefinedTensor::unsafeGetTH(bool retain) { + AT_ERROR("unsafeGetTH(bool retain) called on undefined Tensor"); +} +std::unique_ptr UndefinedTensor::storage() { + AT_ERROR("storage() called on undefined Tensor"); +} + +IntList UndefinedTensor::strides() const { + AT_ERROR("strides() called on undefined Tensor"); +} +Scalar UndefinedTensor::localScalar() { + AT_ERROR("localScalar() called on undefined Tensor"); +} + +UndefinedTensor UndefinedTensor::_singleton; + +} diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/UndefinedTensor.h new file mode 100644 index 0000000..d501f24 --- /dev/null +++ b/aten/src/ATen/UndefinedTensor.h @@ -0,0 +1,27 @@ +#pragma once + +#include "ATen/TensorImpl.h" + +namespace at { + +struct AT_API UndefinedTensor final : public TensorImpl { +public: + static inline UndefinedTensor * singleton() { + return &_singleton; + } + const char * toString() const override; + IntList sizes() const override; + IntList strides() const override; + int64_t dim() const override; + Scalar localScalar() override; + void * unsafeGetTH(bool retain) override; + std::unique_ptr storage() override; + static const char * typeString(); +private: + UndefinedTensor(); + static UndefinedTensor _singleton; +public: + friend struct UndefinedType; +}; + +} // namespace at diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp new file mode 100644 index 0000000..068f9b7 --- /dev/null +++ b/aten/src/ATen/UndefinedType.cpp @@ -0,0 +1,76 @@ +#include "ATen/UndefinedType.h" +#include "ATen/Error.h" + +namespace at { + +UndefinedType::UndefinedType(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/true) {} +ScalarType UndefinedType::scalarType() const { + return ScalarType::Undefined; +} +Backend UndefinedType::backend() const { + return Backend::Undefined; +} +bool UndefinedType::is_cuda() const { return false; } +bool UndefinedType::is_sparse() const { return false; } +bool UndefinedType::is_distributed() const { return false; } + +std::unique_ptr UndefinedType::storage() const { + AT_ERROR("storage not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storage(size_t size) const { + AT_ERROR("storage(size_t) not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + AT_ERROR("storageFromBlob not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::unsafeStorageFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeStorageFromTH not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storageWithAllocator(int64_t size, Allocator* allocator) const { + AT_ERROR("storageWithAllocator not defined for UndefinedType"); +} +Tensor UndefinedType::unsafeTensorFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::generator() const { + AT_ERROR("generator not defined for UndefinedType"); +} + +const char * UndefinedType::toString() const { + return 
UndefinedType::typeString(); +} +TypeID UndefinedType::ID() const { + return TypeID::Undefined; +} + +size_t UndefinedType::elementSizeInBytes() const { + AT_ERROR("elementSizeInBytes not defined for UndefinedType"); +} + +Type & UndefinedType::toBackend(Backend b) const { + if (b == Backend::Undefined) { + return Type::toBackend(b); + } + AT_ERROR("toBackend not implemented for UndefinedType to non-UndefinedType"); +} +Type & UndefinedType::toScalarType(ScalarType s) const { + if (s == ScalarType::Undefined) { + return Type::toScalarType(s); + } + AT_ERROR("toScalarType not implemented for UndefinedType to non-UndefinedType"); +} + +const char * UndefinedType::typeString() { + return "UndefinedType"; +} + +Tensor & UndefinedType::s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const { + AT_ERROR("s_copy not defined for UndefinedType"); +} + +Tensor & UndefinedType::_s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const { + AT_ERROR("_s_copy_from not defined for UndefinedType"); +} + +} diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h new file mode 100644 index 0000000..913066b --- /dev/null +++ b/aten/src/ATen/UndefinedType.h @@ -0,0 +1,40 @@ +#pragma once + +#include "ATen/Type.h" +#include "ATen/Context.h" +#include "ATen/CheckGenerator.h" + +#ifdef _MSC_VER +#ifdef Type +#undef Type +#endif +#endif + +namespace at { + +struct UndefinedType final : public Type { + explicit UndefinedType(Context* context); + virtual ScalarType scalarType() const override; + virtual Backend backend() const override; + virtual bool is_cuda() const override; + virtual bool is_sparse() const override; + virtual bool is_distributed() const override; + virtual std::unique_ptr storage() const override; + virtual std::unique_ptr storage(size_t size) const override; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter) const override; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const override; + virtual std::unique_ptr generator() const override; + virtual const char * toString() const override; + virtual size_t elementSizeInBytes() const override; + virtual Type & toBackend(Backend b) const override; + virtual Type & toScalarType(ScalarType s) const override; + virtual TypeID ID() const override; + static const char * typeString(); + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const override; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; + + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; +}; + +} // namespace at diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp new file mode 100644 index 0000000..3ce4952 --- /dev/null +++ b/aten/src/ATen/Utils.cpp @@ -0,0 +1,15 @@ +#include "ATen/Utils.h" +#include +#include +#include +#include + +namespace at { + +int _crash_if_asan(int arg) { + volatile char x[3]; + x[arg] = 0; + return x[0]; +} + +} // at diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h new file mode 100644 index 0000000..ccefa25 --- /dev/null +++ b/aten/src/ATen/Utils.h @@ -0,0 +1,87 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include "ATen/ArrayRef.h" +#include "ATen/Error.h" +#include "ATen/UndefinedTensor.h" + +#include +#include +#include +#include + +#if defined(__clang__) +#define 
__ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) +#define __ubsan_ignore_vptr__ __attribute__((no_sanitize("vptr"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#define __ubsan_ignore_vptr__ +#endif + +namespace at { + +AT_API int _crash_if_asan(int); + +template +static inline T* checked_cast_storage(Base* expr, const char * name, int pos) { + if (typeid(*expr) != typeid(T)) + AT_ERROR("Expected object of type ", T::typeString(), " but found type ", expr->type().toString(), + " for argument #", pos, " '", name, "'"); + return static_cast(expr); +} + +template +inline T* checked_cast_tensor(Base* expr, const char * name, int pos, bool allowNull) { + if(allowNull && expr == UndefinedTensor::singleton()) { + return nullptr; + } + if (typeid(*expr) != typeid(T)) + AT_ERROR("Expected object of type ", T::typeString(), " but found type ", expr->type().toString(), + " for argument #", pos, " '", name, "'"); + return static_cast(expr); +} + +// Converts a TensorList (i.e. ArrayRef to the underlying TH* Tensor Pointer) +template +static inline std::vector tensor_list_checked_cast(ArrayRef tensors, const char * name, int pos) { + std::vector casted(tensors.size()); + for (unsigned int i = 0; i < tensors.size(); ++i) { + auto *expr = tensors[i].pImpl; + auto result = dynamic_cast(expr); + if (result) { + casted[i] = result->tensor; + } else { + AT_ERROR("Expected a Tensor of type ", T::typeString(), " but found a type ", expr->type().toString(), + " for sequence element ", i, " in sequence argument at position #", pos, " '", name, "'"); + + } + } + return casted; +} + +template +std::array check_intlist(ArrayRef list, const char * name, int pos, ArrayRef def={}) { + if (list.empty()) { + list = def; + } + auto res = std::array(); + if (list.size() == 1 && N > 1) { + res.fill(list[0]); + return res; + } + if (list.size() != N) { + AT_ERROR("Expected a list of ", N, " ints but got ", list.size(), " for argument #", pos, " '", name, "'"); + } + std::copy_n(list.begin(), N, res.begin()); + return res; +} + +inline int64_t sum_intlist(ArrayRef list) { + return std::accumulate(list.begin(), list.end(), 0); +} + +inline int64_t prod_intlist(ArrayRef list) { + return std::accumulate(list.begin(), list.end(), 1, std::multiplies()); +} + +} // at diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h new file mode 100644 index 0000000..a07efa2 --- /dev/null +++ b/aten/src/ATen/WrapDimUtils.h @@ -0,0 +1,89 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include + +namespace at { + +static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) { + if (dim_post_expr <= 0) { + if (!wrap_scalar) { + std::ostringstream oss; + oss << "dimension specified as " << dim << " but tensor has no dimensions"; + throw std::runtime_error(oss.str()); + } + dim_post_expr = 1; // this will make range [-1, 0] + } + + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + AT_CHECK( + dim >= min && dim <= max, + "Dimension out of range (expected to be in range of [", + min, ", ", max, "], but got ", dim, ")"); + if (dim < 0) dim += dim_post_expr; + return dim; +} + +static inline int64_t maybe_wrap_dim(int64_t dim, TensorImpl *tensor) { + return maybe_wrap_dim(dim, tensor->dim()); +} + +static inline int64_t maybe_wrap_dim(int64_t dim, TensorList tensors) { + if (tensors.size() == 0) { + // can't wrap empty TensorList; rely on underlying implementation to throw error if necessary. 
+ return dim; + } + return maybe_wrap_dim(dim, tensors[0].dim()); +} + +static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector> & tensor_sizes) { + if (tensor_sizes.size() == 0) { + // can't wrap empty list; rely on underlying implementation to throw error if necessary + return dim; + } + return maybe_wrap_dim(dim, tensor_sizes[0].size()); +} + +// wrap each of dims basing on dim_post_expr +static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_expr) { + if (dim_post_expr <= 0) { + dim_post_expr = 1; // this will make range [-1, 0] + } + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + for (auto& dim : dims) { + AT_CHECK( + dim >= min && dim <= max, + "Dimension out of range (expected to be in range of [", + min, ", ", max, "], but got ", dim, ")"); + if (dim < 0) dim += dim_post_expr; + } +} + +// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible +// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors +// to be "skipped" (both for wrap dimension behavior and dimension size checking). +// We maintain this behavior for backwards compatibility, but only for this specific size +// (i.e. other empty sizes are not skipped). +static inline int64_t legacy_cat_wrap_dim(int64_t dim, const std::vector>& tensor_sizes) { + for (auto& sizes : tensor_sizes) { + if (sizes == std::vector({0})) { + continue; + } + return maybe_wrap_dim(dim, sizes.size()); + } + return dim; +} + +static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.dim() == 1 && tensor.sizes()[0] == 0) { + continue; + } + return maybe_wrap_dim(dim, tensor.dim()); + } + return dim; +} + +} diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h new file mode 100644 index 0000000..f3d3a81 --- /dev/null +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include "ATen/WrapDimUtils.h" +#include +#include + +namespace at { + +// This is in an extra file to work around strange interaction of +// bitset on Windows with operator overloading + +constexpr size_t dim_bitset_size = 64; + +static inline std::bitset dim_list_to_bitset(IntList dims, int64_t ndims, bool wrap_scalar=true) { + AT_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); + std::bitset seen; + for (size_t i = 0; i < dims.size(); i++) { + size_t dim = maybe_wrap_dim(dims[i], ndims); + AT_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); + seen[dim] = true; + } + return seen; +} + +} diff --git a/aten/src/ATen/code_template.py b/aten/src/ATen/code_template.py new file mode 100644 index 0000000..f239030 --- /dev/null +++ b/aten/src/ATen/code_template.py @@ -0,0 +1,77 @@ +import re + +# match $identifier or ${identifier} and replace with value in env +# If this identifier is at the beginning of whitespace on a line +# and its value is a list then it is treated as +# block subsitution by indenting to that depth and putting each element +# of the list on its own line +# if the identifier is on a line starting with non-whitespace and a list +# then it is comma separated ${,foo} will insert a comma before the list +# if this list is not empty and ${foo,} will insert one after. 
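The substitution rules spelled out in the comment above are easiest to see on a tiny template. The snippet below is a hypothetical usage, assuming the CodeTemplate class defined in this file; the template text and environment values are made up.

```
# Hypothetical CodeTemplate usage; values are made up for illustration.
env = {'name': 'add', 'args': ['Tensor self', 'Scalar other']}
t = CodeTemplate("""\
Tensor ${name}($args) {
  $body
}
""")
# '$args' sits after non-whitespace on its line, so the list is joined with
# commas; '$body' starts a line after whitespace only, so each list element
# is emitted on its own line at that indentation.
print(t.substitute(env, body=['check_inputs();', 'return impl(self, other);']))
```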
+ + +class CodeTemplate(object): + substitution_str = '(^[^\n\S]*)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})' + + # older versions of Python have a bug where \w* does not work, + # so we need to replace with the non-shortened version [a-zA-Z0-9_]* + # https://bugs.python.org/issue18647 + + substitution_str = substitution_str.replace('\w', '[a-zA-Z0-9_]') + + subtitution = re.compile(substitution_str, re.MULTILINE) + + @staticmethod + def from_file(filename): + with open(filename, 'r') as f: + return CodeTemplate(f.read()) + + def __init__(self, pattern): + self.pattern = pattern + + def substitute(self, env={}, **kwargs): + def lookup(v): + return kwargs[v] if v in kwargs else env[v] + + def indent_lines(indent, v): + return "".join([indent + l + "\n" for e in v for l in str(e).splitlines()]).rstrip() + + def replace(match): + indent = match.group(1) + key = match.group(2) + comma_before = '' + comma_after = '' + if key[0] == "{": + key = key[1:-1] + if key[0] == ",": + comma_before = ', ' + key = key[1:] + if key[-1] == ',': + comma_after = ', ' + key = key[:-1] + v = lookup(key) + if indent is not None and isinstance(v, list): + return indent_lines(indent, v) + elif isinstance(v, list): + middle = ', '.join([str(x) for x in v]) + if len(v) == 0: + return middle + return comma_before + middle + comma_after + else: + return (indent or '') + str(v) + return self.subtitution.sub(replace, self.pattern) + + +if __name__ == "__main__": + c = CodeTemplate("""\ + int foo($args) { + + $bar + $bar + $a+$b + } + int commatest(int a${,stuff}) + int notest(int a${,empty,}) + """) + print(c.substitute(args=["hi", 8], bar=["what", 7], + a=3, b=4, stuff=["things...", "others"], empty=[])) diff --git a/aten/src/ATen/common_with_cwrap.py b/aten/src/ATen/common_with_cwrap.py new file mode 100644 index 0000000..9596369 --- /dev/null +++ b/aten/src/ATen/common_with_cwrap.py @@ -0,0 +1,207 @@ +# this code should be common among cwrap and ATen preprocessing +# for now, I have put it in one place but right now is copied out of cwrap + +from copy import deepcopy +from itertools import product + + +def parse_arguments(args): + new_args = [] + for arg in args: + # Simple arg declaration of form " " + if isinstance(arg, str): + t, _, name = arg.partition(' ') + new_args.append({'type': t, 'name': name}) + elif isinstance(arg, dict): + if 'arg' in arg: + arg['type'], _, arg['name'] = arg['arg'].partition(' ') + del arg['arg'] + new_args.append(arg) + else: + assert False + return new_args + + +def set_declaration_defaults(declaration): + declaration.setdefault('arguments', []) + declaration.setdefault('return', 'void') + if 'cname' not in declaration: + declaration['cname'] = declaration['name'] + if 'backends' not in declaration: + declaration['backends'] = ['CPU', 'CUDA'] + if 'api_name' not in declaration: + declaration['api_name'] = (declaration['python_name'] + if 'python_name' in declaration else declaration['name']) + # Simulate multiple dispatch, even if it's not necessary + if 'options' not in declaration: + declaration['options'] = [{'arguments': declaration['arguments']}] + del declaration['arguments'] + # Parse arguments (some of them can be strings) + for option in declaration['options']: + option['arguments'] = parse_arguments(option['arguments']) + # Propagate defaults from declaration to options + for option in declaration['options']: + for k, v in declaration.items(): + # TODO(zach): why does cwrap not propagate 'name'? 
I need it + # propagaged for ATen + if k != 'options': + option.setdefault(k, v) + +# TODO(zach): added option to remove keyword handling for C++ which cannot +# support it. + + +def filter_unique_options(options, allow_kwarg, type_to_signature, remove_self): + def exclude_arg(arg): + return arg.get('ignore_check') or arg['type'] == 'CONSTANT' + + def exclude_arg_with_self_check(arg): + return exclude_arg(arg) or (remove_self and arg['name'] == 'self') + + def signature(option, kwarg_only_count): + if kwarg_only_count == 0: + kwarg_only_count = None + else: + kwarg_only_count = -kwarg_only_count + arg_signature = '#'.join( + type_to_signature.get(arg['type'], arg['type']) + for arg in option['arguments'][:kwarg_only_count] + if not exclude_arg_with_self_check(arg)) + if kwarg_only_count is None: + return arg_signature + kwarg_only_signature = '#'.join( + arg['name'] + '#' + arg['type'] + for arg in option['arguments'][kwarg_only_count:] + if not exclude_arg(arg)) + return arg_signature + "#-#" + kwarg_only_signature + seen_signatures = set() + unique = [] + for option in options: + # if only check num_kwarg_only == 0 if allow_kwarg == False + limit = len(option['arguments']) if allow_kwarg else 0 + for num_kwarg_only in range(0, limit + 1): + sig = signature(option, num_kwarg_only) + if sig not in seen_signatures: + if num_kwarg_only > 0: + for arg in option['arguments'][-num_kwarg_only:]: + arg['kwarg_only'] = True + unique.append(option) + seen_signatures.add(sig) + break + return unique + + +def enumerate_options_due_to_default(declaration, + allow_kwarg=True, type_to_signature=[], remove_self=True): + + # Checks to see if an argument with a default keyword is a Tensor that + # by default can be NULL. In this case, instead of generating another + # option that excludes this argument, we will instead generate a single + # function call that allows for the Tensor to be NULL + def is_nullable_tensor_arg(arg): + return arg['type'] == 'THTensor*' and arg['default'] == 'nullptr' + + # TODO(zach): in cwrap this is shared among all declarations + # but seems to assume that all declarations will have the same + new_options = [] + for option in declaration['options']: + optional_args = [] + for i, arg in enumerate(option['arguments']): + if 'default' in arg: + optional_args.append(i) + for permutation in product((True, False), repeat=len(optional_args)): + option_copy = deepcopy(option) + option_copy['has_full_argument_list'] = sum(permutation) == len(optional_args) + for i, bit in zip(optional_args, permutation): + arg = option_copy['arguments'][i] + # PyYAML interprets NULL as None... 
+ arg['default'] = 'NULL' if arg['default'] is None else arg['default'] + if not bit: + arg['declared_type'] = arg['type'] + arg['type'] = 'CONSTANT' + arg['ignore_check'] = True + new_options.append(option_copy) + declaration['options'] = filter_unique_options(new_options, + allow_kwarg, type_to_signature, remove_self) + + +def sort_by_number_of_options(declaration, reverse=True): + def num_checked_args(option): + return sum(map(lambda a: not a.get('ignore_check', False), option['arguments'])) + declaration['options'].sort(key=num_checked_args, reverse=reverse) + + +class Function(object): + + def __init__(self, name): + self.name = name + self.arguments = [] + + def add_argument(self, arg): + assert isinstance(arg, Argument) + self.arguments.append(arg) + + def __repr__(self): + return self.name + '(' + ', '.join(map(lambda a: a.__repr__(), self.arguments)) + ')' + + +class Argument(object): + + def __init__(self, _type, name, is_optional): + self.type = _type + self.name = name + self.is_optional = is_optional + + def __repr__(self): + return self.type + ' ' + self.name + + +def parse_header(path): + with open(path, 'r') as f: + lines = f.read().split('\n') + + # Remove empty lines and prebackend directives + lines = filter(lambda l: l and not l.startswith('#'), lines) + # Remove line comments + lines = map(lambda l: l.partition('//'), lines) + # Select line and comment part + lines = map(lambda l: (l[0].strip(), l[2].strip()), lines) + # Remove trailing special signs + lines = map(lambda l: (l[0].rstrip(');').rstrip(','), l[1]), lines) + # Split arguments + lines = map(lambda l: (l[0].split(','), l[1]), lines) + # Flatten lines + new_lines = [] + for l, c in lines: + for split in l: + new_lines.append((split, c)) + lines = new_lines + del new_lines + # Remove unnecessary whitespace + lines = map(lambda l: (l[0].strip(), l[1]), lines) + # Remove empty lines + lines = filter(lambda l: l[0], lines) + generic_functions = [] + for l, c in lines: + if l.startswith('TH_API void THNN_'): + fn_name = l.lstrip('TH_API void THNN_') + if fn_name[0] == '(' and fn_name[-2] == ')': + fn_name = fn_name[1:-2] + else: + fn_name = fn_name[:-1] + generic_functions.append(Function(fn_name)) + elif l.startswith('THC_API void THNN_'): + fn_name = l.lstrip('THC_API void THNN_') + if fn_name[0] == '(' and fn_name[-2] == ')': + fn_name = fn_name[1:-2] + else: + fn_name = fn_name[:-1] + generic_functions.append(Function(fn_name)) + elif l: + t, name = l.split() + if '*' in name: + t = t + '*' + name = name[1:] + generic_functions[-1].add_argument( + Argument(t, name, '[OPTIONAL]' in c)) + return generic_functions diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py new file mode 100644 index 0000000..02eb56e --- /dev/null +++ b/aten/src/ATen/copy_wrapper.py @@ -0,0 +1,248 @@ +from code_template import CodeTemplate +from function_wrapper import nested_dict + +FILE = CodeTemplate("""\ +// ${generated_comment} + +#include "ATen/Config.h" + +#include "TH/TH.h" +${cuda_includes} +#include "ATen/Utils.h" +${copy_includes} + +namespace at { + +${copy_functions} + +} +""") + +CUDA_INCLUDES = """\ +#undef THNN_ +#include "THC/THC.h" +""" + +# NB: The copy templates static_cast both dst and src, even though +# technically we also perform a checked_cast_tensor in the prologue +# of the copy (meaning that hypothetically, an already casted tensor +# is available. However, in s_copy, the casted tensor is dst, while +# in _s_copy_from, the casted tensor is src. 
So we can reuse the logic +# in both cases, we unconditionally cast both tensors (and rely +# on the surrounding code to establish the necessary invariants.) + +COPY = CodeTemplate("""\ +${THTensor}_copy${cuda}${src_scalar_name}(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); +""") + +COPY_ASYNC_CPU = CodeTemplate("""\ +if (non_blocking) { + ${THTensor}_copyAsyncCPU(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); + break; +} +""") + +COPY_ASYNC_CUDA = CodeTemplate("""\ +if (non_blocking) { + ${THTensor}_copyAsyncCuda(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); + break; +} +""") + +CASE = CodeTemplate("""\ +case ${case_id}: + ${copies} + break; +""") + +FUNCTION = CodeTemplate("""\ +Tensor & ${Type}::s_copy_(Tensor & dst, const Tensor & src, bool non_blocking) const { + // code generated by copy_wrapper + ${checked_cast_dst} + switch (src.type().ID()) { + ${copy_body} + default: + ${function_fallthrough} + } + dst.pImpl->setScalar(src.pImpl->isScalar()); + return dst; +} +""") + +FUNCTION_FALLTHROUGH_REDISPATCH = "return src.type()._s_copy_from(src, dst, non_blocking);" + +FUNCTION_FALLTHROUGH_ERROR = """\ +AT_ERROR("copy does not support ", src.type().toString(), " to ", toString(), " copy."); +""" + +FUNCTION_FROM = CodeTemplate("""\ +Tensor & ${Type}::_s_copy_from(const Tensor & src, Tensor & dst, bool non_blocking) const { + // code generated by copy_wrapper + ${checked_cast_src} + switch (dst.type().ID()) { + ${copy_body} + default: + AT_ERROR("copy does not support ", toString(), " to ", dst.type().toString(), " copy."); + break; + } + dst.pImpl->setScalar(src.pImpl->isScalar()); + return dst; // NB! dst +} +""") + +# NB: Hypothetically, someone could call s_copy_from directly and get an error +# message which claims something is not supported, when it actually is. 
But +# the correct fix in this case was to NOT call copy_from +FUNCTION_FROM_SWAP = CodeTemplate("""\ +Tensor & ${Type}::_s_copy_from(const Tensor & src, Tensor & dst, bool non_blocking) const { + AT_ERROR("copy does not support ", src.type().toString(), " to ", dst.type().toString(), " copy (s_copy_from case)."); +} +""") + + +def create_one_copy(dst_type, all_types): + copy_body = [] + + for src_type in all_types: + if dst_type['Density'] == 'Sparse' or src_type['Density'] == 'Sparse': + # skip sparse copies, which are not yet implemented + continue + cuda = '' + state = [] + if src_type['Backend'] == 'CUDA' or dst_type['Backend'] == 'CUDA': + state.append('context->getTHCState()') + if src_type['Backend'] == 'CUDA': + if dst_type['Backend'] == 'CUDA': + cuda = 'Cuda' + else: + # don't attempt to process CPU-CUDA; this is handled in the + # redispatch + continue + + body_env = nested_dict({ + 'src_scalar_name': src_type['ScalarName'], + 'case_id': src_type['TypeID'], + 'src_tensor': src_type['Tensor'], + 'dst_tensor': dst_type['Tensor'], + 'cuda': cuda, + 'state': state, + }, dst_type) + + copies = [] + if dst_type['ScalarType'] == src_type['ScalarType']: + if dst_type['Backend'] == 'CUDA' and src_type['Backend'] == 'CPU': + copies.append(COPY_ASYNC_CPU.substitute(body_env)) + copies.append(COPY.substitute(body_env)) + + copy_body.append(CASE.substitute(body_env, copies=copies)) + + if dst_type['Backend'] == 'CPU': + # CPU fallthrough needs to redispatch to _s_copy_from + # (Backend == CPU implies Dense) + assert dst_type['Density'] == 'Dense' + function_fallthrough = FUNCTION_FALLTHROUGH_REDISPATCH + else: + function_fallthrough = FUNCTION_FALLTHROUGH_ERROR + + # Note [checked_cast_tensor is for dense only] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # checked_cast_tensor is only needed for backends which implement + # copy and thus do a cast. Sparse does not support copies, so there + # is no need to do a checked cast. (Furthermore, the code as written + # will not work, as it will try to there is no derived Tensor type + # for sparse.) 
+ checked_cast_dst = '' + if dst_type['Density'] == 'Dense': + checked_cast_dst = 'checked_cast_tensor<{}>(dst.pImpl, "dst", 0, false);'.format(dst_type['Tensor']) + + env = nested_dict({ + 'function_fallthrough': function_fallthrough, + 'checked_cast_dst': checked_cast_dst, + }, dst_type) + return FUNCTION.substitute(env, copy_body=copy_body) + + +def create_one_copy_from(src_type, all_types): + if src_type['DenseBackend'] == 'CPU': + return FUNCTION_FROM_SWAP.substitute(src_type) + + copy_body = [] + + for dst_type in all_types: + if dst_type['Density'] == 'Sparse' or src_type['Density'] == 'Sparse': + # skip sparse copies, which are not yet implemented + continue + cuda = '' + state = [] + if src_type['Backend'] == 'CUDA': + cuda = 'Cuda' + if dst_type['Backend'] == 'CUDA' or src_type['Backend'] == 'CUDA': + state.append('context->getTHCState()') + + body_env = nested_dict({ + 'src_scalar_name': src_type['ScalarName'], + 'case_id': dst_type['TypeID'], + 'src_tensor': src_type['Tensor'], + 'dst_tensor': dst_type['Tensor'], + 'cuda': cuda, + 'state': state, + }, dst_type) + + copies = [] + if dst_type['ScalarType'] == src_type['ScalarType']: + # NB: Technically, we have already short-circuited the + # src_type['Backend'] == 'CUDA' case at the beginning of this + # function + if dst_type['Backend'] == 'CPU' and src_type['Backend'] == 'CUDA': + copies.append(COPY_ASYNC_CUDA.substitute(body_env)) + copies.append(COPY.substitute(body_env)) + + copy_body.append(CASE.substitute(body_env, copies=copies)) + + # See Note [checked_cast_tensor is for dense only] + checked_cast_src = '' + if src_type['Density'] != 'Sparse': + checked_cast_src = 'checked_cast_tensor<{}>(src.pImpl, "src", 0, false);'.format(src_type['Tensor']) + + return FUNCTION_FROM.substitute(src_type, copy_body=copy_body, checked_cast_src=checked_cast_src) + + +def create(all_types, backend): + top_env = { + 'copy_includes': [], + 'copy_functions': [], + 'cuda_includes': [], + 'generated_comment': '@' + 'generated by aten/src/ATen/copy_wrapper.py' + } + + if backend == 'CUDA': + top_env['cuda_includes'].append(CUDA_INCLUDES) + + # Headers to include + for the_type in all_types: + # CUDA backend requires all headers (as it also manages CPU-CUDA + # conversions), but CPU backend should only have CPU headers + if backend == 'CPU' and the_type['DenseBackend'] != 'CPU': + continue + top_env['copy_includes'].append( + '#include "ATen/{}.h"'.format(the_type['Type'])) + if the_type['Density'] != 'Sparse': + # only Dense tensors have a derived Tensor type + top_env['copy_includes'].append( + '#include "ATen/{}.h"'.format(the_type['Tensor'])) + + # Code generation + for the_type in all_types: + # Only generate code for the requested backend + if the_type['DenseBackend'] != backend: + continue + top_env['copy_functions'].append(create_one_copy(the_type, all_types)) + top_env['copy_functions'].append(create_one_copy_from(the_type, all_types)) + + return FILE.substitute(top_env) diff --git a/aten/src/ATen/cpu/vec256/functional.h b/aten/src/ATen/cpu/vec256/functional.h new file mode 100644 index 0000000..c5e4efb --- /dev/null +++ b/aten/src/ATen/cpu/vec256/functional.h @@ -0,0 +1,139 @@ +#pragma once +#include "vec256.h" + +namespace at { namespace vec256 { + +// TODO: Make this more efficient +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + vec256::Vec256 acc_vec, + int64_t size) { + using Vec = vec256::Vec256; + scalar_t acc_arr[Vec::size]; + acc_vec.store(acc_arr); + for (int64_t i = 1; i < size; i++) { + scalar_t 
acc_arr_next[Vec::size]; + acc_arr_next[0] = acc_arr[i]; + Vec acc_vec_next = Vec::loadu(acc_arr_next); + acc_vec = vec_fun(acc_vec, acc_vec_next); + } + acc_vec.store(acc_arr); + return acc_arr[0]; +} + +template +inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) + return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); + int64_t d = Vec::size; + Vec acc_vec = Vec::loadu(data); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + acc_vec = vec_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(vec_fun, acc_vec, Vec::size); +} + +template +inline scalar_t map_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + scalar_t* data, + int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) + return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size); + int64_t d = Vec::size; + Vec acc_vec = map_fun(Vec::loadu(data)); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + data_vec = map_fun(data_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + data_vec = map_fun(data_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec, Vec::size); +} + +template +inline scalar_t map2_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + scalar_t* data, + scalar_t* data2, + int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) { + Vec data_vec = Vec::loadu(data, size); + Vec data2_vec = Vec::loadu(data2, size); + data_vec = map_fun(data_vec, data2_vec); + return vec_reduce_all(red_fun, data_vec, size); + } + int64_t d = Vec::size; + Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2)); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + Vec data2_vec = Vec::loadu(data2 + d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + Vec data2_vec = Vec::loadu(data2 + d, size - d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec, Vec::size); +} + +template +inline void map( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + int64_t size) { + using Vec = vec256::Vec256; + int64_t d = 0; + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d)); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d)); + output_vec.store(output_data + d, size - d); + } +} + +template +inline void map2( + const Op& vec_fun, + scalar_t* output_data, + scalar_t* input_data, + scalar_t* input_data2, + int64_t size) { + using Vec = vec256::Vec256; + int64_t d = 0; + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(input_data + d); + Vec data_vec2 = Vec::loadu(input_data2 + d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + d, size - d); + Vec data_vec2 = 
Vec::loadu(input_data2 + d, size - d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d, size - d); + } +} + +}} // namespace at::vec256 diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h new file mode 100644 index 0000000..442e8fd --- /dev/null +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -0,0 +1,28 @@ +#pragma once + +#if defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h new file mode 100644 index 0000000..98f1158 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -0,0 +1,35 @@ +#pragma once + +#include "intrinsics.h" + +#include "vec256_base.h" +#include "vec256_float.h" +#include "vec256_double.h" +#include "vec256_int.h" + +#include +#include +#include +#include +#include + +namespace at { +namespace vec256 { +namespace { + +template +std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { + T buf[Vec256::size]; + vec.store(buf); + stream << "vec["; + for (int i = 0; i != Vec256::size; i++) { + if (i != 0) { + stream << ", "; + } + stream << buf[i]; + } + stream << "]"; + return stream; +} + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h new file mode 100644 index 0000000..a2ca760 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include + +#include "ATen/Utils.h" + +#if defined(__GNUC__) +#define __at_align32__ __attribute__((aligned(32))) +#elif defined(_WIN32) +#define __at_align32__ __declspec(align(32)) +#else +#define __at_align32__ +#endif + +namespace at { +namespace vec256 { +namespace { + +// NOTE: If you specialize on a type, you must define all operations! 
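+//
+// Rough usage sketch (illustrative only, not taken from this patch). The
+// generic Vec256<T> below and the AVX specializations in vec256_float.h /
+// vec256_double.h expose the same interface:
+//
+//   float in[Vec256<float>::size], out[Vec256<float>::size];
+//   auto v = Vec256<float>::loadu(in);  // load Vec256<float>::size floats
+//   v = v + Vec256<float>(1.0f);        // broadcast constructor + operator+
+//   (v * v).store(out);                 // element-wise multiply, then store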
+ +// emulates vectorized types +template +struct Vec256 { +private: + T values[32 / sizeof(T)] = {0}; +public: + static constexpr int size = 32 / sizeof(T); + Vec256() {} + Vec256(T val) { + for (int i = 0; i != size; i++) { + values[i] = val; + } + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + int64_t mask = mask_; + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + if (mask & 0x01) { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + mask = mask >> 1; + } + return vec; + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + if (i < count) { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 loadu(const void* ptr) { + Vec256 vec; + std::memcpy(vec.values, ptr, 32); + return vec; + } + static Vec256 loadu(const void* ptr, int64_t count) { + Vec256 vec; + std::memcpy(vec.values, ptr, count * sizeof(T)); + return vec; + } + void store(void* ptr, int count = size) const { + std::memcpy(ptr, values, count * sizeof(T)); + } + const T& operator[](int idx) const { + return values[idx]; + } + T& operator[](int idx) { + return values[idx]; + } + Vec256 map(T (*f)(T)) const { + Vec256 ret; + for (int64_t i = 0; i != size; i++) { + ret[i] = f(values[i]); + } + return ret; + } + Vec256 abs() const { + Vec256 ret; + for (int64_t i = 0; i < size; i++) { + ret[i] = values[i] < 0 ? -values[i] : values[i]; + } + return ret; + } + Vec256 acos() const { + return map(std::acos); + } + Vec256 asin() const { + return map(std::asin); + } + Vec256 atan() const { + return map(std::atan); + } + Vec256 erf() const { + return map(std::erf); + } + Vec256 erfc() const { + return map(std::erfc); + } + Vec256 exp() const { + return map(std::exp); + } + Vec256 expm1() const { + return map(std::expm1); + } + Vec256 log() const { + return map(std::log); + } + Vec256 log10() const { + return map(std::log10); + } + Vec256 log1p() const { + return map(std::log1p); + } + Vec256 log2() const { + return map(std::log2); + } + Vec256 ceil() const { + return map(std::ceil); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + Vec256 floor() const { + return map(std::floor); + } + Vec256 neg() const { + return map([](T x) { return -x; }); + } + Vec256 round() const { + return map(std::round); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return map(std::tanh); + } + Vec256 trunc() const { + return map(std::trunc); + } + Vec256 sqrt() const { + return map(std::sqrt); + } + Vec256 reciprocal() const { + return map([](T x) { return (T)(1) / x; }); + } + Vec256 rsqrt() const { + return map([](T x) { return 1 / std::sqrt(x); }); + } +}; + +template Vec256 operator+(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] + b[i]; + } + return c; +} + +template Vec256 operator-(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] - b[i]; + } + return c; +} + +template Vec256 operator*(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] * b[i]; + } + return c; +} + +template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] / 
b[i]; + } + return c; +} + +template Vec256 max(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = std::max(a[i], b[i]); + } + return c; +} + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h new file mode 100644 index 0000000..975948a --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -0,0 +1,183 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" +#if defined(__AVX__) && !defined(_MSC_VER) +#include +#endif + +namespace at { +namespace vec256 { +namespace { + +#if defined(__AVX__) && !defined(_MSC_VER) + +template <> class Vec256 { +private: + __m256d values; +public: + static constexpr int size = 4; + Vec256() {} + Vec256(__m256d v) : values(v) {} + Vec256(double val) { + values = _mm256_set1_pd(val); + } + operator __m256d() const { + return values; + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_pd(a.values, b.values, mask); + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr, int64_t count = size) { + if (count == size) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align32__ double tmp_values[size]; + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(double)); + return _mm256_load_pd(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_pd(reinterpret_cast(ptr), values); + } else { + double tmp_values[size]; + _mm256_storeu_pd(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(double)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + Vec256 map(double (*f)(double)) const { + __at_align32__ double tmp[4]; + store(tmp); + for (int64_t i = 0; i < 4; i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vec256 abs() const { + auto mask = _mm256_set1_pd(-0.f); + return _mm256_andnot_pd(mask, values); + } + Vec256 acos() const { + return Vec256(Sleef_acosd4_u10(values)); + } + Vec256 asin() const { + return Vec256(Sleef_asind4_u10(values)); + } + Vec256 atan() const { + return Vec256(Sleef_atand4_u10(values)); + } + Vec256 erf() const { + return Vec256(Sleef_erfd4_u10(values)); + } + Vec256 erfc() const { + return Vec256(Sleef_erfcd4_u15(values)); + } + Vec256 exp() const { + return Vec256(Sleef_expd4_u10(values)); + } + Vec256 expm1() const { + return Vec256(Sleef_expm1d4_u10(values)); + } + Vec256 log() const { + return Vec256(Sleef_logd4_u10(values)); + } + Vec256 log2() const { + return Vec256(Sleef_log2d4_u10(values)); + } + Vec256 log10() const { + return Vec256(Sleef_log10d4_u10(values)); + } + Vec256 log1p() const { + return Vec256(Sleef_log1pd4_u10(values)); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cos); + } + Vec256 ceil() const { + return _mm256_ceil_pd(values); + } + Vec256 floor() const { + return _mm256_floor_pd(values); + } + Vec256 neg() const { + return _mm256_xor_pd(_mm256_set1_pd(-0.), values); + } + Vec256 round() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vec256 tan() const { + return 
map(std::tan); + } + Vec256 tanh() const { + return Vec256(Sleef_tanhd4_u10(values)); + } + Vec256 trunc() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vec256 sqrt() const { + return _mm256_sqrt_pd(values); + } + Vec256 reciprocal() const { + return _mm256_div_pd(_mm256_set1_pd(1), values); + } + Vec256 rsqrt() const { + return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mul_pd(a, b); +} + +template <> +Vec256 inline operator/(const Vec256& a, const Vec256& b) { + return _mm256_div_pd(a, b); +} + +template <> +Vec256 inline max(const Vec256& a, const Vec256& b) { + return _mm256_max_pd(a, b); +} + +#endif + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h new file mode 100644 index 0000000..09db2f4 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -0,0 +1,188 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" +#if defined(__AVX__) && !defined(_MSC_VER) +#include +#endif + +namespace at { +namespace vec256 { +namespace { + +#if defined(__AVX__) && !defined(_MSC_VER) + +template <> class Vec256 { +private: + __m256 values; +public: + static constexpr int64_t size = 8; + Vec256() {} + Vec256(__m256 v) : values(v) {} + Vec256(float val) { + values = _mm256_set1_ps(val); + } + operator __m256() const { + return values; + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_ps(a.values, b.values, mask); + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr, int64_t count = size) { + if (count == size) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + __at_align32__ float tmp_values[size]; + std::memcpy( + tmp_values, reinterpret_cast(ptr), count * sizeof(float)); + return _mm256_loadu_ps(tmp_values); + } + void store(void* ptr, int64_t count = size) const { + if (count == size) { + _mm256_storeu_ps(reinterpret_cast(ptr), values); + } else { + float tmp_values[size]; + _mm256_storeu_ps(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float)); + } + } + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + Vec256 map(float (*f)(float)) const { + __at_align32__ float tmp[8]; + store(tmp); + for (int64_t i = 0; i < 8; i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vec256 abs() const { + auto mask = _mm256_set1_ps(-0.f); + return _mm256_andnot_ps(mask, values); + } + Vec256 acos() const { + return Vec256(Sleef_acosf8_u10(values)); + } + Vec256 asin() const { + return Vec256(Sleef_asinf8_u10(values)); + } + Vec256 atan() const { + return Vec256(Sleef_atanf8_u10(values)); + } + Vec256 erf() const { + return Vec256(Sleef_erff8_u10(values)); + } + Vec256 erfc() const { + return Vec256(Sleef_erfcf8_u15(values)); + } + Vec256 exp() const { + return Vec256(Sleef_expf8_u10(values)); + } + 
Vec256 expm1() const { + return Vec256(Sleef_expm1f8_u10(values)); + } + Vec256 log() const { + return Vec256(Sleef_logf8_u10(values)); + } + Vec256 log2() const { + return Vec256(Sleef_log2f8_u10(values)); + } + Vec256 log10() const { + return Vec256(Sleef_log10f8_u10(values)); + } + Vec256 log1p() const { + return Vec256(Sleef_log1pf8_u10(values)); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + Vec256 ceil() const { + return _mm256_ceil_ps(values); + } + Vec256 floor() const { + return _mm256_floor_ps(values); + } + Vec256 neg() const { + return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); + } + Vec256 round() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return Vec256(Sleef_tanhf8_u10(values)); + } + Vec256 trunc() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vec256 sqrt() const { + return _mm256_sqrt_ps(values); + } + Vec256 reciprocal() const { + return _mm256_div_ps(_mm256_set1_ps(1), values); + } + Vec256 rsqrt() const { + return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mul_ps(a, b); +} + +template <> +Vec256 inline operator/(const Vec256& a, const Vec256& b) { + return _mm256_div_ps(a, b); +} + +template <> +Vec256 inline max(const Vec256& a, const Vec256& b) { + return _mm256_max_ps(a, b); +} + +#endif + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h new file mode 100644 index 0000000..19a0a93 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -0,0 +1,296 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" + +namespace at { +namespace vec256 { +namespace { + +#ifdef __AVX2__ + +struct Vec256i { +protected: + __m256i values; +public: + Vec256i() {} + Vec256i(__m256i v) : values(v) {} + operator __m256i() const { + return values; + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 4; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + __at_align32__ int64_t tmp_values[size]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi64(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi64(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi64(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi64(b.values, 3); + return loadu(tmp_values); + } + static Vec256 + set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int64_t count) { + __at_align32__ int64_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); + return loadu(tmp_values); 
+ } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int64_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vec256 abs() const { + auto zero = _mm256_set1_epi64x(0); + auto is_larger = _mm256_cmpgt_epi64(zero, values); + auto inverse = _mm256_xor_si256(values, is_larger); + return _mm256_sub_epi64(inverse, is_larger); + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 8; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_epi32(a, b, mask); + } + static Vec256 + set(Vec256 a, Vec256 b, int32_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int32_t count) { + __at_align32__ int32_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int32_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + Vec256 abs() const { + return _mm256_abs_epi32(values); + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 16; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + __at_align32__ int16_t tmp_values[size]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi16(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi16(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi16(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi16(b.values, 3); + if (mask & 0x10) + tmp_values[4] = _mm256_extract_epi16(b.values, 4); + if (mask & 0x20) + tmp_values[5] = _mm256_extract_epi16(b.values, 5); + if (mask & 0x40) + tmp_values[6] = _mm256_extract_epi16(b.values, 6); + if (mask & 0x80) + tmp_values[7] = _mm256_extract_epi16(b.values, 7); + if (mask & 0x100) + tmp_values[8] = _mm256_extract_epi16(b.values, 8); + if (mask & 0x200) + tmp_values[9] = _mm256_extract_epi16(b.values, 9); + if (mask & 0x400) + tmp_values[10] = _mm256_extract_epi16(b.values, 10); + if (mask & 0x800) + tmp_values[11] = _mm256_extract_epi16(b.values, 11); + if (mask & 0x1000) + tmp_values[12] = _mm256_extract_epi16(b.values, 12); + if (mask & 0x2000) + tmp_values[13] = _mm256_extract_epi16(b.values, 13); + if (mask & 0x4000) + tmp_values[14] = _mm256_extract_epi16(b.values, 14); + if (mask & 0x8000) + tmp_values[15] = _mm256_extract_epi16(b.values, 15); + 
return loadu(tmp_values); + } + static Vec256 + set(Vec256 a, Vec256 b, int16_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int16_t count) { + __at_align32__ int16_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int16_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + Vec256 abs() const { + return _mm256_abs_epi16(values); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi64(a, b); +} + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi32(a, b); +} + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi16(a, b); +} + +// AVX2 has no intrinsic for int64_t multiply so it needs to be emulated +// This could be implemented more efficiently using epi32 instructions +// This is also technically avx compatible, but then we'll need AVX +// code for add as well. +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + int64_t a0 = _mm256_extract_epi64(a, 0); + int64_t a1 = _mm256_extract_epi64(a, 1); + int64_t a2 = _mm256_extract_epi64(a, 2); + int64_t a3 = _mm256_extract_epi64(a, 3); + + int64_t b0 = _mm256_extract_epi64(b, 0); + int64_t b1 = _mm256_extract_epi64(b, 1); + int64_t b2 = _mm256_extract_epi64(b, 2); + int64_t b3 = _mm256_extract_epi64(b, 3); + + int64_t c0 = a0 * b0; + int64_t c1 = a1 * b1; + int64_t c2 = a2 * b2; + int64_t c3 = a3 * b3; + + return _mm256_set_epi64x(c3, c2, c1, c0); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mullo_epi32(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mullo_epi16(a, b); +} +#endif + +}}} diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h new file mode 100644 index 0000000..72877bc --- /dev/null +++ b/aten/src/ATen/cpu/vml.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/Config.h" +#include "ATen/Parallel.h" +#include "ATen/cpu/vec256/functional.h" +#include "ATen/cpu/vec256/vec256.h" + +// This header implements various unary operations using a MKL VML style +// interface. + +// It implements various functions with a simple interface +// For example it enables the user to call vsin(float* out, const float* in, +// size) This functions takes a pointer to a contious output array of floats and +// a constant input array. 
It will then apply sin to each value in in the input +// array and write the result into the output array. out and in may point to the +// same memory, i.e. this fully supports in-place operations. These functions +// also implement their own parallelization, so take precautions when calling +// these from threaded functions. + +// When MKL is available it will call into MKL's VML library similar to NumPy +// If MKL is not available it will use SLEEF. + +// This file might be compiled under AVX or AVX2 when called from e.g. +// UnaryOpsKernel.cpp + +#include +#include +#include +#include +#include + +#if AT_MKL_ENABLED() && !defined(__APPLE__) +#include +#endif + +// [Note SSE-AVX transitions] +// There is a bug in Glibc2.23 +// https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280. Calling zeroall +// when using AVX/AVX2 code resolves this. +#if defined(__AVX__) && defined(__GLIBC__) && __GLIBC_MINOR__ == 23 +#define DL_RUNTIME_BUG(op, type) \ + volatile type x = (type)(1); \ + x = std::op(x); \ + _mm256_zeroall(); +#else +#define DL_RUNTIME_BUG(op, type) +#endif + +namespace at { +namespace vml { +namespace { + +using namespace vec256; + +template +inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) { + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map( + [](const Vec256& x) { + return Vec256((scalar_t)(1)) / x.sqrt(); + }, + out + begin, + in + begin, + end - begin); + }); +} + +// NB: We ignore numerical errors by convention and leave them to the user + +// We unfortunately need to duplicate code here to deal with the SSE-AVX +// transition bug (see [Note SSE-AVX transitions]). As soon as we can expect +// users to use a version of glibc newer than 2.23 we will be able to ditch +// this. This duplication is also necessary since not all functions (e.g. rsqrt) +// might be part of cmath. 
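+//
+// Rough usage sketch (illustrative only, not taken from this patch): the
+// IMPLEMENT_VML* macros below generate functions such as vsin/vexp, e.g.
+//
+//   std::vector<float> in(n), out(n);
+//   at::vml::vsin(out.data(), in.data(), n);  // out[i] = sin(in[i])
+//   at::vml::vexp(in.data(), in.data(), n);   // in-place use is supported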
+ +#define IMPLEMENT_VML_BUG(op) \ + template \ + inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) { \ + DL_RUNTIME_BUG(op, scalar_t) \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + begin, \ + end - begin); \ + }); \ + } + +#define IMPLEMENT_VML(op) \ + template \ + inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) { \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + begin, \ + end - begin); \ + }); \ + } + +IMPLEMENT_VML_BUG(abs) +IMPLEMENT_VML_BUG(acos) +IMPLEMENT_VML_BUG(asin) +IMPLEMENT_VML_BUG(atan) +IMPLEMENT_VML_BUG(ceil) +IMPLEMENT_VML_BUG(cos) +// IMPLEMENT_VML_BUG(cosh) +IMPLEMENT_VML_BUG(erf) +IMPLEMENT_VML_BUG(erfc) +IMPLEMENT_VML_BUG(exp) +IMPLEMENT_VML_BUG(expm1) +IMPLEMENT_VML_BUG(floor) +IMPLEMENT_VML(reciprocal) +IMPLEMENT_VML_BUG(log) +IMPLEMENT_VML_BUG(log10) +IMPLEMENT_VML_BUG(log1p) +IMPLEMENT_VML_BUG(log2) +IMPLEMENT_VML(neg) +IMPLEMENT_VML_BUG(sin) +// IMPLEMENT_VML_BUG(sinh) +IMPLEMENT_VML_BUG(sqrt) +IMPLEMENT_VML_BUG(round) +IMPLEMENT_VML(rsqrt) +IMPLEMENT_VML_BUG(tan) +IMPLEMENT_VML_BUG(tanh) +IMPLEMENT_VML_BUG(trunc) + +#if AT_MKL_ENABLED() && !defined(__APPLE__) + +#define IMPLEMENT_VML_MKL(op, mklop) \ + template <> \ + inline void v##op(float* out, const float* in, int64_t size) { \ + vms##mklop(size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } \ + template <> \ + inline void v##op(double* out, const double* in, int64_t size) { \ + vmd##mklop(size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } + +// NB: abs, cosh and sinh were temporarily disabled due to issues with Apple clang + +IMPLEMENT_VML_MKL(abs, Abs) +IMPLEMENT_VML_MKL(acos, Acos) +IMPLEMENT_VML_MKL(asin, Asin) +IMPLEMENT_VML_MKL(atan, Atan) +IMPLEMENT_VML_MKL(cos, Cos) +// IMPLEMENT_VML_MKL(cosh, Cosh) +IMPLEMENT_VML_MKL(erf, Erf) +IMPLEMENT_VML_MKL(erfc, Erfc) +IMPLEMENT_VML_MKL(exp, Exp) +IMPLEMENT_VML_MKL(expm1, Expm1) +IMPLEMENT_VML_MKL(log, Ln) +IMPLEMENT_VML_MKL(log10, Log10) +IMPLEMENT_VML_MKL(log1p, Log1p) +IMPLEMENT_VML_MKL(sin, Sin) +// IMPLEMENT_VML_MKL(sinh, Sinh) +IMPLEMENT_VML_MKL(sqrt, Sqrt) +IMPLEMENT_VML_MKL(tan, Tan) +IMPLEMENT_VML_MKL(tanh, Tanh) +IMPLEMENT_VML_MKL(trunc, Trunc) + +#if INTEL_MKL_VERSION >= 20180406 +IMPLEMENT_VML_MKL(log2, Log2) +#endif + +#endif + +} // namespace +} // namespace vml +} // namespace at diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h new file mode 100644 index 0000000..4dade5e --- /dev/null +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -0,0 +1,11 @@ +#pragma once + +#ifdef _WIN32 +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# define AT_CUDA_API __declspec(dllexport) +# else +# define AT_CUDA_API __declspec(dllimport) +# endif +#else +# define AT_CUDA_API +#endif diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh new file mode 100644 index 0000000..e34cd14 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -0,0 +1,908 @@ +#pragma once + +#include "detail/IndexUtils.cuh" +#include "ATen/TensorUtils.h" +#include "THC/THCAtomics.cuh" + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. 
+// + +namespace at { +namespace cuda { + +// TODO: combine with TensorArg? So far that's been for debugging, and this is functional... +enum class TensorArgType { ReadWrite, ReadOnly }; + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. +// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=4) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). +template +void rearrangeDims(detail::TensorInfo* aInfo, + detail::TensorInfo* bInfo = nullptr, + detail::TensorInfo* cInfo = nullptr, + detail::TensorInfo* dInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[4] = { aInfo->sizes, }; + IndexType *strides[4] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + if (dInfo != nullptr) { + ++numInfos; + if (dInfo->dims != dims) return; + sizes[3] = dInfo->sizes; + strides[3] = dInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. 
+ bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// Threads per block for our apply kernel +// FIXME: use occupancy calculator instead +#define AT_APPLY_THREADS_PER_BLOCK 32 * 16 +#define AT_APPLY_BLOCKS_PER_SM 4 + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply2(detail::TensorInfo a, + detail::TensorInfo b, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + op(a.data[aOffset], b.data[bOffset]); + } +} + + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply3(detail::TensorInfo a, + detail::TensorInfo b, + detail::TensorInfo c, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = + detail::IndexToOffset::get(linearIndex, c); + + op(a.data[aOffset], b.data[bOffset], c.data[cOffset]); + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply4(detail::TensorInfo a, + detail::TensorInfo b, + detail::TensorInfo c, + detail::TensorInfo d, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = + detail::IndexToOffset::get(linearIndex, c); + + // Convert `linearIndex` into an offset of `d` + const IndexType dOffset = + detail::IndexToOffset::get(linearIndex, d); + + op(a.data[aOffset], b.data[bOffset], c.data[cOffset], d.data[dOffset]); + } +} + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { 
+ if (curDevice == -1) return false; + uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); + uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; + if (numBlocks > maxGridX) + numBlocks = maxGridX; + grid = dim3(numBlocks); + return true; +} + +inline dim3 getApplyBlock() { + return dim3(AT_APPLY_THREADS_PER_BLOCK); +} + +/* + Apply a pointwise operator to two tensors. + + The calling convention for op is a function/functor that takes takes two references to + type scalar; at least one of these references should be non-const in order to write the output. + For example, to compute a = b^2, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }; +*/ +template +bool CUDA_tensor_apply2(at::Tensor a, + at::Tensor b, + Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply2", {a, b}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + Tensor oldA; + Tensor oldB; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
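+  //
+  // A small worked example of that collapsing (illustrative only): copying a
+  // transposed 2x3 source into a contiguous 2x3 destination gives
+  //   dst: sizes [2, 3], strides [3, 1]  -> collapses to a single dim of size 6
+  //   src: sizes [2, 3], strides [1, 2]  -> cannot collapse, stays 2-dimensional
+  // so only the (1-D dst, 2-D src) specialization is launched below.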
+ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + aInfo, bInfo, (TYPE) totalElements, op); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + kernelPointwiseApply2 + <<>>( + aInfo, bInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + kernelPointwiseApply2 + <<>>( + aInfo, bInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + return true; +} + +/* + Apply a pointwise operator to three tensors. + + The calling convention for op is a function/functor that takes takes three references to + type scalar; at least one of these references should be non-const in order to write the output. 
+ For example, to compute a = b + c, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val) { + a_val = b_val + c_val; + }; +*/ +template +bool CUDA_tensor_apply3(at::Tensor a, + at::Tensor b, + at::Tensor c, + const Op& op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply3", {a, b, c}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel() || + totalElements != c.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS || + c.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + Tensor oldA; + Tensor oldB; + Tensor oldC; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + if (cType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldC = c; + c = c.contiguous(); + } + +#define HANDLE_CASE(TYPE, A, B, C) \ + kernelPointwiseApply3 \ + <<>>( \ + aInfo, bInfo, cInfo, (TYPE) totalElements, op); + +#define HANDLE_C_CASE(TYPE, A, B, C) { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b) && + detail::canUse32BitIndexMath(c)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + 
bInfo.collapseDims(); + cInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { + kernelPointwiseApply3 + <<>>( + aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + kernelPointwiseApply3 + <<>>( + aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + if (oldC.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldC._copy_ignoring_overlaps_(c); + c = oldC; + } + + return true; +} + +/* + Apply a pointwise operator to four tensors. + + The calling convention for op is a function/functor that takes takes four references to + type scalar; at least one of these references should be non-const in order to write the output. + For example, to compute a = b + c * d, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val, const scalar &d_val) { + a_val = b_val + c_val * d_val; + }; +*/ +template +bool CUDA_tensor_apply4(at::Tensor a, + at::Tensor b, + at::Tensor c, + at::Tensor d, + const Op& op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly, + TensorArgType dType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply4", {a, b, c, d}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel() || + totalElements != c.numel() || + totalElements != d.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS || + c.dim() > MAX_TENSORINFO_DIMS || + d.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. 
+ */ + Tensor oldA; + Tensor oldB; + Tensor oldC; + Tensor oldD; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + if (cType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldC = c; + c = c.contiguous(); + } + if (dType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldD = d; + d = d.contiguous(); + } + +#define HANDLE_CASE(TYPE, A, B, C, D) \ + kernelPointwiseApply4 \ + <<>>( \ + aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); + +#define HANDLE_D_CASE(TYPE, A, B, C, D) { \ + switch (D) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, C, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, C, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, C, -1); \ + break; \ + } \ +} + +#define HANDLE_C_CASE(TYPE, A, B, C, D) { \ + switch (C) { \ + case 1: \ + HANDLE_D_CASE(TYPE, A, B, 1, D); \ + break; \ + case 2: \ + HANDLE_D_CASE(TYPE, A, B, 2, D); \ + break; \ + default: \ + HANDLE_D_CASE(TYPE, A, B, -1, D); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C, D) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C, D); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C, D); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C, D); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C, D) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C, D); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C, D); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C, D); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b) && + detail::canUse32BitIndexMath(c) && + detail::canUse32BitIndexMath(d)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + detail::TensorInfo dInfo = + detail::getTensorInfo(d); + + rearrangeDims(&aInfo, &bInfo, &cInfo, &dInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + dInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + detail::TensorInfo dInfo = + detail::getTensorInfo(d); + + rearrangeDims(&aInfo, &bInfo, &cInfo, &dInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + dInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. 
+ */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1 && dInfo.dims == 1) { + kernelPointwiseApply4 + <<>>( + aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + kernelPointwiseApply4 + <<>>( + aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_D_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + if (oldC.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldC._copy_ignoring_overlaps_(c); + c = oldC; + } + + if (oldD.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldD._copy_ignoring_overlaps_(c); + d = oldD; + } + + return true; +} + +} // cuda +} // at diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in new file mode 100644 index 0000000..72adee5 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -0,0 +1,7 @@ +#pragma once + +// Test these using #if AT_CUDNN_ENABLED(), not #ifdef, so that it's +// obvious if you forgot to include Config.h +// c.f. https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined + +#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ diff --git a/aten/src/ATen/cuda/CUDAGenerator.cpp b/aten/src/ATen/cuda/CUDAGenerator.cpp new file mode 100644 index 0000000..38fcd84 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGenerator.cpp @@ -0,0 +1,56 @@ +#include "ATen/Config.h" + +#include "ATen/CUDAGenerator.h" +#include "ATen/Context.h" +#include "THCTensorRandom.h" +#include + +// There is only one CUDAGenerator instance. Calls to seed(), manualSeed(), +// initialSeed(), and unsafeGetTH() refer to the THCGenerator on the current +// device. 
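A short sketch of the per-device behaviour described above (illustrative only; `gen` is assumed to be the single CUDAGenerator obtained from the ATen Context, and the raw cudaSetDevice calls stand in for whatever device-switching mechanism the caller actually uses):

```
cudaSetDevice(0);
gen.manualSeed(123);             // seeds only device 0's THCGenerator
cudaSetDevice(1);
uint64_t s = gen.initialSeed();  // queries device 1's generator, not device 0's
gen.manualSeedAll(123);          // seeds the generator on every device
```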
+ +THCGenerator* THCRandom_getGenerator(THCState* state); + +namespace at { + +CUDAGenerator::CUDAGenerator(Context * context_) + : context(context_) +{ +} + +CUDAGenerator::~CUDAGenerator() { + // no-op Generator state is global to the program +} + +CUDAGenerator& CUDAGenerator::copy(const Generator& from) { + throw std::runtime_error("CUDAGenerator::copy() not implemented"); +} + +CUDAGenerator& CUDAGenerator::free() { + THCRandom_shutdown(context->getTHCState()); + return *this; +} + +uint64_t CUDAGenerator::seed() { + return THCRandom_initialSeed(context->getTHCState()); +} + +uint64_t CUDAGenerator::initialSeed() { + return THCRandom_initialSeed(context->getTHCState()); +} + +CUDAGenerator& CUDAGenerator::manualSeed(uint64_t seed) { + THCRandom_manualSeed(context->getTHCState(), seed); + return *this; +} + +CUDAGenerator& CUDAGenerator::manualSeedAll(uint64_t seed) { + THCRandom_manualSeedAll(context->getTHCState(), seed); + return *this; +} + +void * CUDAGenerator::unsafeGetTH() { + return (void*)THCRandom_getGenerator(context->getTHCState()); +} + +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu new file mode 100644 index 0000000..c13efc6 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAHalf.cu @@ -0,0 +1,56 @@ +#include "ATen/cuda/CUDAHalf.cuh" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) +template <> AT_CUDA_API +half convert(Half aten_half) { + return half{aten_half.x}; +} + +template <> AT_CUDA_API +half convert(double value) { + return half{Half(value).x}; +} + +template <> AT_CUDA_API +Half convert(half cuda_half) { + return Half(cuda_half.x, Half::from_bits); +} +#else +template <> AT_CUDA_API +half convert(Half aten_half) { + __half_raw x_raw; + x_raw.x = aten_half.x; + return half(x_raw); +} + +template <> AT_CUDA_API +Half convert(half cuda_half) { + __half_raw raw(cuda_half); + return Half(raw.x, Half::from_bits); +} + +template <> AT_CUDA_API +half convert(double value) { + __half_raw raw; + raw.x = Half(value).x; + return half {raw}; +} + +template <> __half HalfFix(Half h) { + __half_raw raw; + raw.x = h.x; + return __half{raw}; +} + +template <> Half HalfFix(__half h) { + __half_raw raw(h); + return Half(raw.x, Half::from_bits); +} +#endif +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh new file mode 100644 index 0000000..87e7621 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAHalf.cuh @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/cuda/ATenCUDAGeneral.h" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +template <> AT_CUDA_API half convert(Half aten_half); +template <> AT_CUDA_API Half convert(half cuda_half); +template <> AT_CUDA_API half convert(double value); +#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) +template <> __half HalfFix(Half h); +template <> Half HalfFix(__half h); +#endif +} // namespace at diff --git a/aten/src/ATen/cuda/CUDATensorMethods.cuh b/aten/src/ATen/cuda/CUDATensorMethods.cuh new file mode 100644 index 0000000..39f81d9 --- /dev/null +++ b/aten/src/ATen/cuda/CUDATensorMethods.cuh @@ -0,0 +1,15 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +template <> +inline __half* Tensor::data() const { + return reinterpret_cast<__half*>(data()); +} +} // namespace at diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp new 
file mode 100644 index 0000000..3e6c683 --- /dev/null +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp @@ -0,0 +1,17 @@ +#include +#include +#include + +#include +#include + +#include + +namespace at { namespace cuda { + +at::Allocator* getPinnedMemoryAllocator() { + auto state = globalContext().lazyInitCUDA(); + return state->cudaHostAllocator; +} + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.h b/aten/src/ATen/cuda/PinnedMemoryAllocator.h new file mode 100644 index 0000000..f3aa457 --- /dev/null +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at { namespace cuda { + +at::Allocator* getPinnedMemoryAllocator(); + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp new file mode 100644 index 0000000..2969924 --- /dev/null +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -0,0 +1,238 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "THC/THC.h" +#include + +#if AT_CUDNN_ENABLED() +#include "ATen/cudnn/cudnn-wrapper.h" +#endif + +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace { + +void check_status(int32_t status) { + AT_CHECK( + static_cast(status) == cudaSuccess, + "CUDA error (", + static_cast(status), + "): ", + cudaGetErrorString(static_cast(status))); +} + +void set_device(int32_t device) { + check_status(cudaSetDevice(device)); +} + +void get_device(int32_t* device) { + check_status(cudaGetDevice(device)); +} + +void unchecked_set_device(int32_t device) { + const auto return_code = cudaSetDevice(device); + (void)return_code; +} + +void cuda_stream_create_with_priority( + cudaStream_t* pStream +, int32_t flags +, int32_t priority) { +#ifndef __HIP_PLATFORM_HCC__ + check_status(cudaStreamCreateWithPriority(pStream, flags, priority)); +#else + check_status(cudaStreamCreateWithFlags(pStream, flags)); +#endif +} + +void cuda_stream_destroy(cudaStream_t stream) { + check_status(cudaStreamDestroy(stream)); +} + +struct DynamicCUDAInterfaceSetter { + DynamicCUDAInterfaceSetter() { + at::detail::DynamicCUDAInterface::set_device = set_device; + at::detail::DynamicCUDAInterface::get_device = get_device; + at::detail::DynamicCUDAInterface::unchecked_set_device = + unchecked_set_device; + at::detail::DynamicCUDAInterface::cuda_stream_create_with_priority = + cuda_stream_create_with_priority; + at::detail::DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + } +}; + +// Single, global, static (because of the anonymous namespace) instance, whose +// constructor will set the static members of `DynamicCUDAInterface` to CUDA +// functions when the ATen CUDA library is loaded. +DynamicCUDAInterfaceSetter _; +} // namespace + +// NB: deleter is dynamic, because we need it to live in a separate +// compilation unit (alt is to have another method in hooks, but +// let's not if we don't need to!) 
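The DynamicCUDAInterfaceSetter above relies on a standard C++ idiom: a static object in an anonymous namespace whose constructor swaps stub function pointers for real implementations as a side effect of this translation unit being loaded. A minimal standalone sketch of that idiom (all names below are illustrative, not taken from ATen):

```
#include <cstdio>

namespace iface {
// Stub used until the "backend" translation unit is linked in.
void default_do_work() { std::puts("backend not loaded"); }
void (*do_work)() = default_do_work;
}  // namespace iface

namespace {
void real_do_work() { std::puts("backend loaded"); }

struct Setter {
  Setter() { iface::do_work = real_do_work; }
};
// Constructing this static instance rewires the pointer during static
// initialization, mirroring what DynamicCUDAInterfaceSetter does for CUDA.
Setter setter;
}  // namespace

int main() {
  iface::do_work();  // prints "backend loaded" once the setter TU is in the binary
}
```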
+std::unique_ptr CUDAHooks::initCUDA() const { + THCState* thc_state = THCState_alloc(); + // Caching allocator has no context + THCState_setDeviceAllocator(thc_state, THCCachingAllocator_get()); + thc_state->cudaHostAllocator = getTHCCachingHostAllocator(); + THCudaInit(thc_state); + return std::unique_ptr( + thc_state, [](THCState* p) { + if (p) + THCState_free(p); + }); +} + +std::unique_ptr CUDAHooks::initCUDAGenerator( + Context* context) const { + return std::unique_ptr(new CUDAGenerator(context)); +} + +bool CUDAHooks::hasCUDA() const { + int count; + cudaError_t err = cudaGetDeviceCount(&count); + if (err == cudaErrorInsufficientDriver) { + return false; + } + return true; +} + +bool CUDAHooks::hasCuDNN() const { + return AT_CUDNN_ENABLED(); +} + +#ifndef __HIP_PLATFORM_HCC__ +cusparseHandle_t CUDAHooks::getCurrentCUDASparseHandle(THCState* thc_state) const { + return THCState_getCurrentSparseHandle(thc_state); +} +#endif +struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties( + THCState* thc_state) const { + return THCState_getCurrentDeviceProperties(thc_state); +} +struct cudaDeviceProp* CUDAHooks::getDeviceProperties( + THCState* thc_state, + int device) const { + return THCState_getDeviceProperties(thc_state, device); +} + +int64_t CUDAHooks::current_device() const { + int device; + cudaError_t err = cudaGetDevice(&device); + if (err == cudaSuccess) { + return device; + } + return -1; +} + +Allocator* CUDAHooks::getPinnedMemoryAllocator() const { + return at::cuda::getPinnedMemoryAllocator(); +} + +void CUDAHooks::registerCUDATypes(Context* context) const { + register_cuda_types(context); +} + +bool CUDAHooks::compiledWithCuDNN() const { + return AT_CUDNN_ENABLED(); +} + +bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { +#if AT_CUDNN_ENABLED() + cudaDeviceProp* prop = + getCurrentDeviceProperties(globalContext().getTHCState()); + // NOTE: extra parenthesis around numbers disable clang warnings about + // dead code + return ( + (CUDNN_VERSION >= (6021)) || + (CUDNN_VERSION >= (6000) && prop->major >= 5)); +#else + return false; +#endif +} + +long CUDAHooks::versionCuDNN() const { +#if AT_CUDNN_ENABLED() + return CUDNN_VERSION; +#else + AT_ERROR("Cannot query CuDNN version if ATen_cuda is not built with CuDNN"); +#endif +} + +double CUDAHooks::batchnormMinEpsilonCuDNN() const { +#if AT_CUDNN_ENABLED() + return CUDNN_BN_MIN_EPSILON; +#else + AT_ERROR( + "Cannot query CUDNN_BN_MIN_EPSILON if ATen_cuda is not built with CuDNN"); +#endif +} + +int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize() const { +#ifndef __HIP_PLATFORM_HCC__ + return at::native::detail::cufft_get_plan_cache_max_size_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +void CUDAHooks::cuFFTSetPlanCacheMaxSize(int64_t max_size) const { +#ifndef __HIP_PLATFORM_HCC__ + at::native::detail::cufft_set_plan_cache_max_size_impl(max_size); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +int64_t CUDAHooks::cuFFTGetPlanCacheSize() const { +#ifndef __HIP_PLATFORM_HCC__ + return at::native::detail::cufft_get_plan_cache_size_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +void CUDAHooks::cuFFTClearPlanCache() const { +#ifndef __HIP_PLATFORM_HCC__ + at::native::detail::cufft_clear_plan_cache_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +int CUDAHooks::getNumGPUs() const { + int count; + auto err = cudaGetDeviceCount(&count); + if (err == cudaErrorNoDevice) { + return 0; + } else if (err != cudaSuccess) { + 
AT_ERROR( + "CUDA error (", static_cast(err), "): ", cudaGetErrorString(err)); + } + return count; +} + +// Sigh, the registry doesn't support namespaces :( +using at::CUDAHooksRegistry; +using at::RegistererCUDAHooksRegistry; + +REGISTER_CUDA_HOOKS(CUDAHooks); + +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h new file mode 100644 index 0000000..d88ac0d --- /dev/null +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -0,0 +1,36 @@ +#include + +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at { namespace cuda { namespace detail { + +// The real implementation of CUDAHooksInterface +struct CUDAHooks : public at::CUDAHooksInterface { + CUDAHooks(at::CUDAHooksArgs) {} + std::unique_ptr initCUDA() const override; + std::unique_ptr initCUDAGenerator(Context*) const override; + bool hasCUDA() const override; + bool hasCuDNN() const override; +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const override; +#endif + struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; + struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; + int64_t current_device() const override; + Allocator* getPinnedMemoryAllocator() const override; + void registerCUDATypes(Context*) const override; + bool compiledWithCuDNN() const override; + bool supportsDilatedConvolutionWithCuDNN() const override; + long versionCuDNN() const override; + double batchnormMinEpsilonCuDNN() const override; + int64_t cuFFTGetPlanCacheMaxSize() const override; + void cuFFTSetPlanCacheMaxSize(int64_t max_size) const override; + int64_t cuFFTGetPlanCacheSize() const override; + void cuFFTClearPlanCache() const override; + int getNumGPUs() const override; +}; + +}}} // at::cuda::detail diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cu b/aten/src/ATen/cuda/detail/IndexUtils.cu new file mode 100644 index 0000000..43b2637 --- /dev/null +++ b/aten/src/ATen/cuda/detail/IndexUtils.cu @@ -0,0 +1,97 @@ +#include "IndexUtils.cuh" + +namespace at { +namespace cuda { +namespace detail { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ + int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +/* +Returns false if there is no possibility that the tensor +has "overlapping" indices and true otherwise. +"Overlapping" indices are two+ valid indices that specify +the same offset within the tensor. +The function does this by checking for a sufficient but not +necessary condition of no overlap. In particular, that +that there exists an ordering of the tensor's dimensions +that is nicely "nested," with each dimension contained +within the next one. +*/ +bool maybeOverlappingIndices(const Tensor& t) { + /* Extract size/stride arrays; only consider size >1 dims. 
*/ + SizeAndStride *info = (SizeAndStride *)alloca(sizeof(SizeAndStride) * t.dim()); + int dims = t.dim(); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = t.size(i); + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = t.stride(i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + // Short-circuits if tensor is a single element. + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} + +bool canUse32BitIndexMath(const Tensor& t, int64_t max_elem) { + int64_t elements = t.numel(); + if (elements >= max_elem) { + return false; + } + + int64_t offset = 0; + int64_t linearId = elements - 1; + + for (int i = t.dim() - 1; i >= 0; --i) { + int64_t curDimIndex = linearId % t.size(i); + int64_t curDimOffset = curDimIndex * t.stride(i); + offset += curDimOffset; + linearId /= t.size(i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh new file mode 100644 index 0000000..9bbf8f7 --- /dev/null +++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh @@ -0,0 +1,32 @@ +#pragma once + +#include "ATen/ATen.h" +#include "TensorInfo.cuh" +#include + +namespace at { +namespace cuda { +namespace detail { + +bool maybeOverlappingIndices(const at::Tensor& t); +bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits::max()); + +template +TensorInfo +getTensorInfo(const at::Tensor& t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + return TensorInfo( + t.data(), dims, sz, st); +} + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cuda/detail/TensorInfo.cuh b/aten/src/ATen/cuda/detail/TensorInfo.cuh new file mode 100644 index 0000000..e0ada29 --- /dev/null +++ b/aten/src/ATen/cuda/detail/TensorInfo.cuh @@ -0,0 +1,186 @@ +#pragma once + +#include "ATen/ATen.h" + +namespace at { +namespace cuda { +namespace detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + /* + Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of + the info, possibly excluding the optional excludeDim. A "collapsed" version + of the info is the fewest dims that order the tensor's elements in the same + way as the original info. If excludeDim is specified, the collapse is the + fewest dims that order the tensor's elements as the original and preserve the + excluded dimension, unless the tensor collapses to a point. + + Returns the (new) index of the preserved dimension if excludeDim is + specified. Returns 0 if the tensor is collapsed to a point. Returns -1 + otherwise. 
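A worked example may help (illustrative, not from the original comment): a TensorInfo with sizes [2, 1, 3] and strides [3, 3, 1] orders its elements exactly like a contiguous 1-D tensor, so collapseDims() rewrites it to sizes [6], strides [1] and returns -1. Calling collapseDims(0) instead preserves the first dimension, producing sizes [2, 3], strides [3, 1] and returning 0, the new index of the preserved dimension.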
+ */ + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + AT_ASSERT(dims < MAX_TENSORINFO_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + AT_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + + AT_CHECK(excludeDim >= -1 && excludeDim < dims, + "expected excluded dim between -1 and dims - 1"); + + int stopDim = (excludeDim == -1) ? dims : excludeDim; + int newIndex = -1; + int oldIndex = 0; + int remappedExcludedDim = -1; + + while (oldIndex < dims) { + // Finds a dimension to collapse into + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + ++oldIndex; + break; + } + + // Collapses dims + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { + sizes[newIndex] *= sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } else { + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } + } + + // Handles excludeDim being set (oldIndex == excludeDim) + if (oldIndex != dims) { + + // Preserves excluded dimension + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + remappedExcludedDim = newIndex; + + // Restarts iteration after excludeDim + ++oldIndex; + stopDim = dims; + } + } + + // Handles special case of all dims size 1 + if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + return 0; + } + + dims = newIndex + 1; + return remappedExcludedDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compiletime) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp new file mode 100644 index 0000000..aafaebf --- /dev/null +++ 
b/aten/src/ATen/cudnn/Descriptors.cpp @@ -0,0 +1,135 @@ +#include "Descriptors.h" + +#include + +#include +#include +#include + +namespace at { namespace native { + +namespace { + +inline cudnnDataType_t getDataType(const at::Type& t) { + auto scalar_type = t.scalarType(); + if (scalar_type == at::kFloat) { + return CUDNN_DATA_FLOAT; + } else if (scalar_type == at::kHalf) { + return CUDNN_DATA_HALF; + } else if (scalar_type == at::kDouble) { + return CUDNN_DATA_DOUBLE; + } + throw std::runtime_error("TensorDescriptor only supports double, float and half tensors"); +} + +inline cudnnDataType_t getDataType(const at::Tensor& t) { + return getDataType(t.type()); +} + +} // anonymous namespace + + +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad); +} + +void TensorDescriptor::set(cudnnDataType_t datatype, IntList t_sizes, IntList t_strides, size_t pad) { + size_t dim = t_sizes.size(); + if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + int size[CUDNN_DIM_MAX]; + int stride[CUDNN_DIM_MAX]; + for (size_t i = 0; i < dim; ++i) { + size[i] = static_cast(t_sizes[i]); + stride[i] = static_cast(t_strides[i]); + } + for (size_t i = dim; i < pad; ++i) { + size[i] = 1; + stride[i] = 1; + } + set(datatype, static_cast(std::max(dim, pad)), size, stride); +} + +std::string cudnnTypeToString(cudnnDataType_t dtype) { + switch (dtype) { + case CUDNN_DATA_FLOAT: + return "CUDNN_DATA_FLOAT"; + case CUDNN_DATA_DOUBLE: + return "CUDNN_DATA_DOUBLE"; + case CUDNN_DATA_HALF: + return "CUDNN_DATA_HALF"; + case CUDNN_DATA_INT8: + return "CUDNN_DATA_INT8"; + case CUDNN_DATA_INT32: + return "CUDNN_DATA_INT32"; + case CUDNN_DATA_INT8x4: + return "CUDNN_DATA_INT8x4"; +#if CUDNN_VERSION >= 7100 + case CUDNN_DATA_UINT8: + return "CUDNN_DATA_UINT8"; + case CUDNN_DATA_UINT8x4: + return "CUDNN_DATA_UINT8x4"; +#endif + default: + std::ostringstream oss; + oss << "(unknown data-type " << static_cast(dtype) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims; + int dimA[CUDNN_DIM_MAX]; + int strideA[CUDNN_DIM_MAX]; + cudnnDataType_t dtype; + cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! + out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + out << " strideA = "; + for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void TensorDescriptor::print() { std::cout << *this; } + +void FilterDescriptor::set(const at::Tensor &t, int64_t pad) { + auto dim = t.ndimension(); + if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + if (!t.is_contiguous()) { + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. 
+ throw std::runtime_error("cuDNN filters (a.k.a. weights) must be contiguous"); + } + int size[CUDNN_DIM_MAX]; + for (int i = 0; i < dim; ++i) { + size[i] = (int) t.size(i); + } + for (int i = dim; i < pad; ++i) { + size[i] = (int) 1; + } + dim = std::max(dim, pad); + set(getDataType(t), (int) dim, size); +} + +}} diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h new file mode 100644 index 0000000..2bf7f0a --- /dev/null +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -0,0 +1,334 @@ +#pragma once + +#include "Exceptions.h" + +#include "cudnn-wrapper.h" +#include +#include +#include "ATen/cuda/ATenCUDAGeneral.h" +#include + +#if CUDNN_VERSION < 7000 + +#include + +/* +Note [cuDNN dropout descriptor initialization] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, setting descriptors in cuDNN is cheap (e.g., +cudnnSetTensorNdDescriptor). However, this is not the case for +cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an +expensive precomputation to initialize the random number generator states. In +cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor, +which means that law-abiding clients were expected to generate a dropout +descriptor once and cache it. However, our ATen interface is (1) stateless (so +we can't cache the descriptors) and (2) does not accept arbitrary user types in +its interface (so we can't pass the descriptor in). This puts us in a pickle. + +In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which +forgoes the expensive initialization process, and can initialize the +descriptor with a pre-initialized state CUDA tensor. This is great, because +it means we can simply pass in the state tensor and then initialize the +descriptor internally. Unfortunately, this function is not available in +cuDNN 6. + +To work around this, we break the cuDNN abstraction barrier, and have +the struct layout of the underlaying dropout descriptor. With this struct, +we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great! +*/ + +// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization] +struct cudnnDropoutStruct { + float dropout; + int nstates; + void * states; +}; + +#endif + +namespace at { namespace native { + +// TODO: Add constructors for all of the descriptors + +inline int dataSize(cudnnDataType_t dataType) +{ + switch (dataType) { + case CUDNN_DATA_HALF: return 2; + case CUDNN_DATA_FLOAT: return 4; + default: return 8; + } +} + +// The stride for a size-1 dimensions is not uniquely determined; in +// fact, it can be anything you want, because the fact that the +// tensor is size 1 at this dimension means that you will never actually +// try advancing your pointer by this stride. +// +// However, CuDNN has a much more stringent requirement on strides: +// if you are passing a contiguous input, it better be the case +// that the stride for dim i is the product of the sizes of dims +// i+1 to the end. This stride is indeed uniquely determined. This +// function modifies 'stride' in place so this invariant holds. +static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) { + int64_t z = 1; + for(int d = dim-1; d >= 0; d--) + { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + +template +struct DescriptorDeleter { + void operator()(T* x) { + if (x != nullptr) { + AT_CUDNN_CHECK(dtor(x)); + } + } +}; + +// A generic class for wrapping cuDNN descriptor types. 
All you need +// is to give the underlying type the Descriptor_t points to (usually, +// if it's cudnnTensorDescriptor_t it points to cudnnTensorStruct), +// the constructor and the destructor. Subclasses are responsible +// for defining a set() function to actually set the descriptor. +// +// Descriptors default construct to a nullptr, and have a descriptor +// initialized the first time you call set() or any other initializing +// function. +template +class AT_CUDA_API Descriptor +{ +public: + // TODO: Figure out why const-correctness doesn't work here + + // Use desc() to access the underlying descriptor pointer in + // a read-only fashion. Most client code should use this. + // If the descriptor was never initialized, this will return + // nullptr. + T* desc() const { return desc_.get(); } + T* desc() { return desc_.get(); } + + // Use mut_desc() to access the underlying desciptor pointer + // if you intend to modify what it points to (e.g., using + // cudnnSetFooDescriptor). This will ensure that the descriptor + // is initialized. Code in this file will use this function. + T* mut_desc() { init(); return desc_.get(); } +protected: + void init() { + if (desc_ == nullptr) { + T* raw_desc; + AT_CUDNN_CHECK(ctor(&raw_desc)); + desc_.reset(raw_desc); + } + } +private: + std::unique_ptr> desc_; +}; + +class AT_CUDA_API TensorDescriptor + : public Descriptor +{ +public: + TensorDescriptor() {} + explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) { + set(t, pad); + } + + // Note [CuDNN broadcast padding] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // pad specifies the minimum dimensionality of the tensor descriptor + // we produce (it doesn't have anything to do with, e.g., convolution + // padding). If 't' is lower-dimensional than 'pad', the remaining + // dimensions (on the right) are padded with ones. This doesn't + // affect the underlying data layout. This is particularly useful for + // dealing with a pecularity of the CuDNN API, which is that broadcasting in CuDNN is + // done in two steps: first, the client code is expected to pad out + // (the dimensions) input tensors to be the same dimension as the + // target broadcast, and then second, CuDNN takes of actually + // broadcasting size 1 dimensions. 
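For example (an illustrative sketch, not part of this patch; `t` is assumed to be a 3-D CUDA tensor with sizes [N, C, L]):

```
// With pad = 4 the descriptor reports four dimensions, [N, C, L, 1], while
// t's own sizes, strides and storage are left untouched.
at::native::TensorDescriptor desc(t, /*pad=*/4);
```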
+ + void set(const at::Tensor &t, size_t pad = 0); + void set(cudnnDataType_t dataType, IntList sizes, IntList strides, size_t pad = 0); + + void print(); + +private: + void set(cudnnDataType_t dataType, int dim, int* size, int* stride) { + fixSizeOneDimStride(dim, size, stride); + AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); + +class FilterDescriptor + : public Descriptor +{ +public: + void set(const at::Tensor &t, int64_t pad = 0); + +private: + void set(cudnnDataType_t dataType, int dim, int* size) { + AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, CUDNN_TENSOR_NCHW, dim, size)); + } +}; + +struct AT_CUDA_API ConvolutionDescriptor + : public Descriptor +{ + void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) { + cudnnDataType_t mathType = dataType; + if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT; + AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, + CUDNN_CROSS_CORRELATION, mathType)); +#if CUDNN_VERSION >= 7000 + AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups)); + AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); + if(dataType == CUDNN_DATA_HALF) + AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +#endif + } +}; + +struct AT_CUDA_API SpatialTransformerDescriptor + : public Descriptor +{ + void set(cudnnDataType_t dataType, int dim, int* size) { + AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size)); + } +}; + +#if CUDNN_VERSION < 7000 + +// See Note [cuDNN dropout descriptor initialization] +inline cudnnStatus_t cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends. + // This is not entirely accurate but is good enough to catch some API + // uses which would not be compatible in cuDNN 7. Feel free to fix + // this if you notice something is wrong. + if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE; + if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE; + size_t expectedStateSizeInBytes; + // State size will differ depending on size of GPU + auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes); + if (ret != CUDNN_STATUS_SUCCESS) return ret; + if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE; + dropoutDesc->dropout = dropout; + dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t); + dropoutDesc->states = states; + return CUDNN_STATUS_SUCCESS; +} + +#endif // CUDNN_VERSION + +struct AT_CUDA_API DropoutDescriptor + : public Descriptor +{ + at::Tensor state; + + // Initialize a dropout descriptor's RNG state. + // WARNING: This function is very expensive, avoid calling this function! + // NB: it takes a Type so that we can generate a Variable if necessary. 
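A hedged sketch of the intended lifecycle, combining the three entry points defined just below so that the expensive RNG setup runs only once (`handle` and `byte_cuda_type`, a CUDA Byte at::Type, are assumed to be supplied by the caller):

```
at::Tensor make_and_reuse_dropout_state(cudnnHandle_t handle,
                                        const at::Type& byte_cuda_type) {
  at::native::DropoutDescriptor first;
  first.initialize_rng(byte_cuda_type, handle, /*dropout=*/0.5f, /*seed=*/42);  // expensive
  at::Tensor cached_state = first.state;              // keep this tensor around

  at::native::DropoutDescriptor later;
  later.set(handle, /*dropout=*/0.5f, cached_state);  // cheap: reuses the cached state

  at::native::DropoutDescriptor no_drop;
  no_drop.set_no_dropout(handle);                     // dropout == 0 path, also cheap
  return cached_state;
}
```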
+ void initialize_rng(const Type& type, cudnnHandle_t handle, float dropout, long long int seed) { + AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + size_t state_size; + AT_CUDNN_CHECK(cudnnDropoutGetStatesSize(handle, &state_size)); + AT_ASSERT(type.is_cuda()); + AT_ASSERT(type.scalarType() == kByte); + state = at::empty({static_cast(state_size)}, type); + AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed)); + } + + // Restore a dropout descriptor given a dropout probability and existing RNG state. + // See Note [cuDNN dropout descriptor initialization] + void set(cudnnHandle_t handle, float dropout, at::Tensor state_) { + AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + state = state_; + void *state_ptr = state.data_ptr(); + size_t state_size = state.size(0); + // NB: The seed doesn't actually matter, so we give a dummy value + AT_CUDNN_CHECK(cudnnRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */)); + } + + // Restore a dropout descriptor corresponding to no dropout + // See Note [cuDNN dropout descriptor initialization] + void set_no_dropout(cudnnHandle_t handle) { + // NB: seed doesn't matter when dropout = 0, because no random number + // initialization actually takes place when there is no dropout. + // NB: Empirically, cudnnSetDropoutDescriptor is cheap when + // dropoot == 0 + AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); + } +}; + +struct AT_CUDA_API RNNDescriptor + : public Descriptor +{ + DropoutDescriptor dropout_desc_; + void set(cudnnHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc, + cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional, + cudnnRNNMode_t mode, cudnnDataType_t datatype) { + dropout_desc_ = std::move(dropout_desc); + AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v6( + handle, + mut_desc(), + hidden_size, + num_layers, + dropout_desc_.desc(), + input_mode, + bidirectional, + mode, + CUDNN_RNN_ALGO_STANDARD, + datatype)); +#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000 + cudaDeviceProp* prop = globalContext().getCurrentDeviceProperties(); + if (prop->major >= 7) { + if (datatype == CUDNN_DATA_HALF) { + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); + } else { + // Technically, as the default it's not necessary to explicitly + // set this. + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH); + } + } +#endif + } +}; + +union Constant +{ + float f; + double d; + Constant(cudnnDataType_t dataType, double value) { + if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) { + f = (float) value; + } else { + d = value; + } + } +}; + +}} // namespace diff --git a/aten/src/ATen/cudnn/Exceptions.h b/aten/src/ATen/cudnn/Exceptions.h new file mode 100644 index 0000000..b59127e --- /dev/null +++ b/aten/src/ATen/cudnn/Exceptions.h @@ -0,0 +1,17 @@ +#pragma once +#include +#define AT_CUDNN_CHECK(STATUS) \ + if (STATUS != CUDNN_STATUS_SUCCESS) { \ + if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ + AT_ERROR( \ + "CuDNN error: ", \ + cudnnGetErrorString(STATUS), \ + ". 
This error may appear if you passed in a non-contiguous input."); \ + } else { \ + AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ + } \ + } +#define AT_CUDA_CHECK(STATUS) \ + if (STATUS != cudaSuccess) { \ + AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ + } diff --git a/aten/src/ATen/cudnn/Handles.cpp b/aten/src/ATen/cudnn/Handles.cpp new file mode 100644 index 0000000..7aae86d --- /dev/null +++ b/aten/src/ATen/cudnn/Handles.cpp @@ -0,0 +1,51 @@ +#include "Handles.h" + +#include "Exceptions.h" + +#include +#include + +// TODO: Get rid of the mutex, and just initialize these +// handles in at::Context along with lazy CUDA initialization + +namespace at { namespace native { + +namespace { + +struct Handle { + cudnnHandle_t handle; + Handle() : handle(NULL) { + AT_CUDNN_CHECK(cudnnCreate(&handle)); + } + ~Handle() { + if (handle) { +// this is because of something dumb in the ordering of +// destruction. Sometimes atexit, the cuda context (or something) +// would already be destroyed by the time this gets destroyed. It +// happens in fbcode setting. @colesbury and I decided to not destroy +// the handle as a workaround. +// - @soumith +#ifdef NO_CUDNN_DESTROY_HANDLE +#else + cudnnDestroy(handle); +#endif + } + } +}; + +std::mutex mutex; +std::unordered_map handles; + +} // namespace + + +cudnnHandle_t getCudnnHandle() +{ + int device; + AT_CUDA_CHECK(cudaGetDevice(&device)); + + std::lock_guard guard(mutex); + return handles[device].handle; +} + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Handles.h b/aten/src/ATen/cudnn/Handles.h new file mode 100644 index 0000000..369b1f3 --- /dev/null +++ b/aten/src/ATen/cudnn/Handles.h @@ -0,0 +1,10 @@ +#pragma once + +#include "cudnn-wrapper.h" +#include "ATen/cuda/ATenCUDAGeneral.h" + +namespace at { namespace native { + +AT_CUDA_API cudnnHandle_t getCudnnHandle(); + +}} // namespace diff --git a/aten/src/ATen/cudnn/README.md b/aten/src/ATen/cudnn/README.md new file mode 100644 index 0000000..057fbc9 --- /dev/null +++ b/aten/src/ATen/cudnn/README.md @@ -0,0 +1,4 @@ +All files living in this directory are written with the assumption that cuDNN is available, +which means that these code are not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever +you need to use definitions from here, please guard the `#include` and +definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](native/cudnn/BatchNorm.cpp). 
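A minimal sketch of the guard pattern this README asks for (the wrapper function below is illustrative, not taken from the tree):

```
#include "ATen/cuda/CUDAConfig.h"   // provides AT_CUDNN_ENABLED()

#if AT_CUDNN_ENABLED()
#include "ATen/cudnn/Types.h"       // headers under ATen/cudnn/ must be guarded too
#endif

int64_t reported_cudnn_version() {
#if AT_CUDNN_ENABLED()
  return at::native::cudnn_version();
#else
  return -1;  // built without cuDNN support
#endif
}
```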
diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp new file mode 100644 index 0000000..7c11d46 --- /dev/null +++ b/aten/src/ATen/cudnn/Types.cpp @@ -0,0 +1,24 @@ +#include "Types.h" + +#include + +namespace at { namespace native { + +cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { + if (tensor.type().scalarType() == at::kFloat) { + return CUDNN_DATA_FLOAT; + } else if (tensor.type().scalarType() == at::kDouble) { + return CUDNN_DATA_DOUBLE; + } else if (tensor.type().scalarType() == at::kHalf) { + return CUDNN_DATA_HALF; + } + std::string msg("getCudnnDataType() not supported for "); + msg += at::toString(tensor.type().scalarType()); + throw std::runtime_error(msg); +} + +int64_t cudnn_version() { + return CUDNN_VERSION; +} + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Types.h b/aten/src/ATen/cudnn/Types.h new file mode 100644 index 0000000..33fa8e6 --- /dev/null +++ b/aten/src/ATen/cudnn/Types.h @@ -0,0 +1,12 @@ +#pragma once + +#include "cudnn-wrapper.h" +#include + +namespace at { namespace native { + +cudnnDataType_t getCudnnDataType(const at::Tensor& tensor); + +int64_t cudnn_version(); + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Utils.h b/aten/src/ATen/cudnn/Utils.h new file mode 100644 index 0000000..c2e5dcb --- /dev/null +++ b/aten/src/ATen/cudnn/Utils.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "THC/THC.h" +#include "cudnn-wrapper.h" +#include "Handles.h" + +namespace at { namespace native { + +inline void setCuDNNStreamToCurrent() { + // TODO: Should getCurrentStream be a method on Context? + AT_CUDNN_CHECK(cudnnSetStream(getCudnnHandle(), THCState_getCurrentStream(globalContext().getTHCState()))); +} + +// cuDNN has a buggy check for tensor being contiguous (that is, it does +// not ignore stride for dimension that is equal to 0). This function +// makes tensors which have zero stride contiguous, by setting the +// strides to 1 as cuDNN likes. +inline Tensor contiguousIfZeroInStrides(const Tensor& t) { + for (auto s : t.strides()) { + if (s == 0) return t.contiguous(); + } + return t; +} + +}} diff --git a/aten/src/ATen/cudnn/cudnn-wrapper.h b/aten/src/ATen/cudnn/cudnn-wrapper.h new file mode 100644 index 0000000..320646e --- /dev/null +++ b/aten/src/ATen/cudnn/cudnn-wrapper.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#define STRINGIFY(x) #x +#define STRING(x) STRINGIFY(x) + +#if CUDNN_MAJOR < 6 +#pragma message ("CuDNN v" STRING(CUDNN_MAJOR) " found, but need at least CuDNN v6. You can get the latest version of CuDNN from https://developer.nvidia.com/cudnn or disable CuDNN with NO_CUDNN=1") +#pragma message "We strongly encourage you to move to 6.0 and above." +#pragma message "This message is intended to annoy you enough to update." 
+#endif + +#undef STRINGIFY +#undef STRING + diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py new file mode 100644 index 0000000..f020dd0 --- /dev/null +++ b/aten/src/ATen/cwrap_parser.py @@ -0,0 +1,22 @@ +import yaml + +# follows similar logic to cwrap, ignores !inc, and just looks for [[]] + + +def parse(filename): + with open(filename, 'r') as file: + declaration_lines = [] + declarations = [] + in_declaration = False + for line in file.readlines(): + line = line.rstrip() + if line == '[[': + declaration_lines = [] + in_declaration = True + elif line == ']]': + in_declaration = False + declaration = yaml.load('\n'.join(declaration_lines)) + declarations.append(declaration) + elif in_declaration: + declaration_lines.append(line) + return declarations diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp new file mode 100644 index 0000000..b6897ed --- /dev/null +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -0,0 +1,76 @@ +#include + +#include + +#include +#include +#include + +namespace at { +namespace detail { + +void default_set_device(int32_t) { + AT_ERROR( + "DynamicCUDAInterface::set_device called " + "before CUDA library was loaded"); +} + +void default_get_device(int32_t*) { + AT_ERROR( + "DynamicCUDAInterface::get_device called " + "before CUDA library was loaded"); +} + +void default_unchecked_set_device(int32_t) { + AT_ERROR( + "DynamicCUDAInterface::unchecked_set_device called " + "before CUDA library was loaded"); +} + +void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { + AT_ERROR( + "DynamicCUDAInterface::cuda_stream_create_with_priority called " + "before CUDA library was loaded"); +} + +void default_cuda_stream_destroy(cudaStream_t) { + AT_ERROR( + "DynamicCUDAInterface::cuda_stream_destroy called " + "before CUDA library was loaded"); +} + +// Default the static members of DynamicCUDAInterface. +void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; +void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; +void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = + default_unchecked_set_device; +void (*DynamicCUDAInterface::cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t) + = default_cuda_stream_create_with_priority; +void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) + = default_cuda_stream_destroy; + + +const CUDAHooksInterface& getCUDAHooks() { + static std::unique_ptr cuda_hooks; + // NB: The once_flag here implies that if you try to call any CUDA + // functionality before libATen_cuda.so is loaded, CUDA is permanently + // disabled for that copy of ATen. In principle, we can relax this + // restriction, but you might have to fix some code. See getVariableHooks() + // for an example where we relax this restriction (but if you try to avoid + // needing a lock, be careful; it doesn't look like Registry.h is thread + // safe...) 
+ static std::once_flag once; + std::call_once(once, [] { + cuda_hooks = CUDAHooksRegistry()->Create("CUDAHooks", CUDAHooksArgs{}); + if (!cuda_hooks) { + cuda_hooks = + std::unique_ptr(new CUDAHooksInterface()); + } + }); + return *cuda_hooks; +} +} // namespace detail + +AT_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) + +} // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h new file mode 100644 index 0000000..e15cf36 --- /dev/null +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -0,0 +1,182 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +// Forward declare these CUDA types here to avoid including CUDA headers in +// ATen headers, which would make ATen always require CUDA to build. +struct THCState; +struct cudaDeviceProp; +struct CUstream_st; +typedef struct CUstream_st* cudaStream_t; + +#ifndef __HIP_PLATFORM_HCC__ +// pyHIPIFY rewrites this as: +// +// struct cusparseContext; +// typedef struct cusparseContext *hipsparseHandle_t; +// +// however, this forward declaration is wrong +// the way that the HIP headers define hipsparseHandle_t is +// +// typedef cusparseHandle_t hipsparseHandle_t +// +// so the rewrite is wrong. +struct cusparseContext; +typedef struct cusparseContext *cusparseHandle_t; +#endif + +namespace at { +class Context; +} + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +constexpr const char* CUDA_HELP = + "PyTorch splits its backend into two shared libraries: a CPU library " + "and a CUDA library; this error has occurred because you are trying " + "to use some CUDA functionality, but the CUDA library has not been " + "loaded by the dynamic linker for some reason. The CUDA library MUST " + "be loaded, EVEN IF you don't directly use any symbols from the CUDA library! " + "One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many " + "dynamic linkers will delete dynamic library dependencies if you don't " + "depend on any of their symbols. You can check if this has occurred by " + "using ldd on your binary to see if there is a dependency on *_cuda.so " + "library."; + +// The CUDAHooksInterface is an omnibus interface for any CUDA functionality +// which we may want to call into from CPU code (and thus must be dynamically +// dispatched, to allow for separate compilation of CUDA code). How do I +// decide if a function should live in this class? There are two tests: +// +// 1. Does the *implementation* of this function require linking against +// CUDA libraries? +// +// 2. Is this function *called* from non-CUDA ATen code? +// +// (2) should filter out many ostensible use-cases, since many times a CUDA +// function provided by ATen is only really ever used by actual CUDA code. +// +// TODO: Consider putting the stub definitions in another class, so that one +// never forgets to implement each virtual function in the real implementation +// in CUDAHooks. This probably doesn't buy us much though. +struct AT_API CUDAHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~CUDAHooksInterface() {} + + // Initialize THCState and, transitively, the CUDA state + virtual std::unique_ptr initCUDA() const { + AT_ERROR("Cannot initialize CUDA without ATen_cuda library. 
", CUDA_HELP); + } + + virtual std::unique_ptr initCUDAGenerator(Context*) const { + AT_ERROR("Cannot initialize CUDA generator without ATen_cuda library. ", CUDA_HELP); + } + + virtual bool hasCUDA() const { + return false; + } + + virtual bool hasCuDNN() const { + return false; + } + +#ifndef __HIP_PLATFORM_HCC__ + virtual cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const { + AT_ERROR("Cannot getCurrentCUDASparseHandle() without ATen_cuda library. ", CUDA_HELP); + } +#endif + + virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { + AT_ERROR("Cannot getCurrentDeviceProperties() without ATen_cuda library. ", CUDA_HELP); + } + + virtual struct cudaDeviceProp* getDeviceProperties(THCState*, int device) + const { + AT_ERROR("Cannot getDeviceProperties() without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t current_device() const { + return -1; + } + + virtual Allocator* getPinnedMemoryAllocator() const { + AT_ERROR("Pinned memory requires CUDA. ", CUDA_HELP); + } + + virtual void registerCUDATypes(Context*) const { + AT_ERROR("Cannot registerCUDATypes() without ATen_cuda library. ", CUDA_HELP); + } + + virtual bool compiledWithCuDNN() const { + return false; + } + + virtual bool supportsDilatedConvolutionWithCuDNN() const { + return false; + } + + virtual long versionCuDNN() const { + AT_ERROR("Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); + } + + virtual double batchnormMinEpsilonCuDNN() const { + AT_ERROR( + "Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t cuFFTGetPlanCacheMaxSize() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual void cuFFTSetPlanCacheMaxSize(int64_t max_size) const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t cuFFTGetPlanCacheSize() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual void cuFFTClearPlanCache() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual int getNumGPUs() const { + return 0; + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct AT_API CUDAHooksArgs {}; + +AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +#define REGISTER_CUDA_HOOKS(clsname) \ + AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) + +namespace detail { +AT_API const CUDAHooksInterface& getCUDAHooks(); + +/// This class exists to let us access `cudaSetDevice`, `cudaGetDevice` and CUDA +/// error handling functions, when CUDA is available. These functions will first +/// default to no-ops. When the `ATen` GPU library is loaded, they will be set to +/// the `cudaSetDevice`/`cudaGetDevice` functions. This allows us to access them +/// with only a single pointer indirection, while virtual dispatch would require +/// two (one for the virtual call, one for `cudaSetDevice`/`cudaGetDevice`). 
+struct AT_API DynamicCUDAInterface { + static void (*set_device)(int32_t); + static void (*get_device)(int32_t*); + static void (*unchecked_set_device)(int32_t); + static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); + static void (*cuda_stream_destroy)(cudaStream_t); +}; +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp new file mode 100644 index 0000000..07531d8 --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.cpp @@ -0,0 +1,7 @@ +#include + +namespace at { namespace detail { + +void deleteNothing(void*) {} + +}} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h new file mode 100644 index 0000000..866c0ef --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -0,0 +1,84 @@ +#include + +#include + +namespace at { + +using DeleterFnPtr = void(*)(void*); + +namespace detail { + +// Does not delete anything +AT_API void deleteNothing(void*); + +// A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but +// with three major differences: +// +// 1) It is specialized to void +// +// 2) It is specialized for a function pointer deleter +// void(void* ctx); i.e., the deleter doesn't take a +// reference to the data, just to a context pointer +// (erased as void*). In fact, internally, this pointer +// is implemented as having an owning reference to +// context, and a non-owning reference to data; this is why +// you release_context(), not release() (the conventional +// API for release() wouldn't give you enough information +// to properly dispose of the object later.) +// +// 3) The deleter is guaranteed to be called when the unique +// pointer is destructed and the context is non-null; this is different +// from std::unique_ptr where the deleter is not called if the +// data pointer is null. +// +// Some of the methods have slightly different types than std::unique_ptr +// to reflect this. +// +class UniqueVoidPtr { +private: + // Lifetime tied to ctx_ + void* data_; + std::unique_ptr ctx_; +public: + UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} + UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) + : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} + void* operator->() const { return data_; } + void* get() const { return data_; } + void* get_context() const { return ctx_.get(); } + void* release_context() { return ctx_.release(); } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + if (get_deleter() != expected_deleter) return nullptr; + return static_cast(get_context()); + } + operator bool() const { return data_ || ctx_; } + DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } +}; + + +// Note [How UniqueVoidPtr is implemented] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// UniqueVoidPtr solves a common problem for allocators of tensor data, which +// is that the data pointer (e.g., float*) which you are interested in, is not +// the same as the context pointer (e.g., DLManagedTensor) which you need +// to actually deallocate the data. Under a conventional deleter design, you +// have to store extra context in the deleter itself so that you can actually +// delete the right thing. 
Implementing this with standard C++ is somewhat +// error-prone: if you use a std::unique_ptr to manage tensors, the deleter will +// not be called if the data pointer is nullptr, which can cause a leak if the +// context pointer is non-null (and the deleter is responsible for freeing both +// the data pointer and the context pointer). +// +// So, in our reimplementation of unique_ptr, which just store the context +// directly in the unique pointer, and attach the deleter to the context +// pointer itself. In simple cases, the context pointer is just the pointer +// itself. + +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } + +}} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.cpp b/aten/src/ATen/detail/VariableHooksInterface.cpp new file mode 100644 index 0000000..8569052 --- /dev/null +++ b/aten/src/ATen/detail/VariableHooksInterface.cpp @@ -0,0 +1,29 @@ +#include + +namespace at { + +namespace detail { + + // NB: The VariableHooks returned by this function may CHANGE after dlopen() + // NB: This function takes a lock, don't call it from perf critical paths + const VariableHooksInterface& getVariableHooks() { + static std::mutex var_hooks_mutex; + static std::unique_ptr var_hooks = nullptr; + static std::unique_ptr default_var_hooks = + std::unique_ptr(new VariableHooksInterface()); + std::lock_guard lock(var_hooks_mutex); + + if (!var_hooks) { + var_hooks = VariableHooksRegistry()->Create("VariableHooks", VariableHooksArgs{}); + } + if (var_hooks) { + return *var_hooks; + } + return *default_var_hooks; + } + +} + +AT_DEFINE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) + +} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h new file mode 100644 index 0000000..2871164 --- /dev/null +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +namespace at { + class Context; +} + +// NB: Registry class not actually in the namespace detail, due to limitations +// of Registry.h +namespace at { + +// The VariableHooksInterface is an interface for autograd functionality +// which currently doesn't live in libATen.so AND needs to be called from +// ATen. In this case, it is only the type registry for Variable types, +// letting us add extra variables types if CUDA types are initialized lazily. +// +// We may choose to absorb autograd into ATen, in which case this interface is obsolete. +// +struct AT_API VariableHooksInterface { + + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~VariableHooksInterface() {} + + virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { + // no-op if Variable not available; it'll get handled (if at all) when + // libtorch.so gets loaded + } + +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." 
in a variadic macro" +struct AT_API VariableHooksArgs {}; + +AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) +#define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) + +namespace detail { + AT_API const VariableHooksInterface& getVariableHooks(); +} + +} // namespace at diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h new file mode 100644 index 0000000..f8dc8fc --- /dev/null +++ b/aten/src/ATen/dlpack.h @@ -0,0 +1,141 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 010 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*! + * \brief The device type in DLContext. + */ +typedef enum { + kDLCPU = 1, + kDLGPU = 2, + // kDLCPUPinned = kDLCPU | kDLGPU + kDLCPUPinned = 3, + kDLOpenCL = 4, + kDLMetal = 8, + kDLVPI = 9, + kDLROCM = 10, +} DLDeviceType; + +/*! + * \brief A Device context for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! \brief The device index */ + int device_id; +} DLContext; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + kDLInt = 0U, + kDLUInt = 1U, + kDLFloat = 2U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. + * + * Examples + * - float: type_code = 2, bits = 32, lanes=1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 + * - int8: type_code = 0, bits = 8, lanes=1 + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ + void* data; + /*! \brief The device context of the tensor */ + DLContext ctx; + /*! \brief Number of dimensions */ + int ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor, + * can be NULL, indicating tensor is compact. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to faciliate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + */ +typedef struct DLManagedTensor { + /*! 
\brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/aten/src/ATen/extract_cwrap.py b/aten/src/ATen/extract_cwrap.py new file mode 100644 index 0000000..64c2281 --- /dev/null +++ b/aten/src/ATen/extract_cwrap.py @@ -0,0 +1,38 @@ +from optparse import OptionParser + +parser = OptionParser() +parser.add_option('-o', '--output', help='where to write the result file.', + action='store', default='.') +options, _ = parser.parse_args() + +files = [ + # '../../csrc/cudnn/cuDNN.cwrap', + '../../csrc/generic/TensorMethods.cwrap', + # '../../csrc/generic/methods/SparseTensor.cwrap', + '../../csrc/generic/methods/Tensor.cwrap', + '../../csrc/generic/methods/TensorApply.cwrap', + '../../csrc/generic/methods/TensorCompare.cwrap', + '../../csrc/generic/methods/TensorCuda.cwrap', + '../../csrc/generic/methods/TensorMath.cwrap', + '../../csrc/generic/methods/TensorRandom.cwrap', + # '../../csrc/generic/methods/TensorSerialization.cwrap', +] + +declaration_lines = [] + +for filename in files: + with open(filename, 'r') as file: + in_declaration = False + for line in file.readlines(): + line = line.rstrip() + if line == '[[': + in_declaration = True + declaration_lines.append(line) + elif line == ']]': + in_declaration = False + declaration_lines.append(line) + elif in_declaration: + declaration_lines.append(line) + +with open(options.output, 'w') as output: + output.write('\n'.join(declaration_lines) + '\n') diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py new file mode 100644 index 0000000..1c06654 --- /dev/null +++ b/aten/src/ATen/function_wrapper.py @@ -0,0 +1,1517 @@ +# HEY! Trying to understand what this file does? Read +# "what has to be done to add a Operation ..." first! + +import re +from code_template import CodeTemplate + +try: + import typing # noqa: F401 +except ImportError: + raise RuntimeError( + 'Missing build dependency: Unable to import the `typing` module. ' + 'Please install it via `conda install typing` or `pip install typing`') + +# flake8 doesn't take into account usages in type annotations. +from typing import Union, Set # noqa: F401 +from typing import Any, Dict, List, Optional, Tuple, NamedTuple + +try: + from mypy_extensions import TypedDict +except ImportError: + # Avoid the dependency on the mypy_extensions package. + # It is required, however, for type checking. + def TypedDict(name, attrs, total=True): # type: ignore + return Dict[Any, Any] + +import sys +if sys.version_info[0] == 3: + string_type = str +else: + string_type = basestring + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# what has to be done to add a Operation ... +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# 1. 
if broadcasting or without the full list of arguments, add a non-virtual +# declaration under Type.h (right now, we call this template +# BROADCAST but it also handles default arguments) +TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ +${return_type} ${api_name}(${type_method_formals_with_defaults}) const; +""") +# 2. broadcasting functions are implemented in Type.cpp +TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + Tensor ${broadcast_returns}; + std::tie(${broadcast_returns}) = ${broadcast_function}(${broadcast_actuals}, "${api_name}"); + return ${method_prefix_derived}${api_name}(${broadcast_modified_actuals}); +} +""") +# 3. add virtual dispatch declaration to Type.h and impl to Type.cpp; method_prefix_derived +# is present for providing a base-class definition for a derived-type method with a prefix. +# +# If the declaration is abstract, then the actual implementation will +# be in a derived type; we put in a simple default "not implemented" +# stub. However, if the declaration is concrete, we dispatch to the +# actual implementation. At the moment, this situation *only* occurs +# for 'native' declarations (so the native dispatch is hardcoded into +# the template here.) +TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const; +""") +TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ +${return_type} Type::${method_prefix_derived}${api_name}(${type_method_formals}) const { + AT_ERROR("${method_prefix_derived}${api_name} is not implemented for type ", toString()); +} +""") +TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ +virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const; +""") +DEPRECATED_TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ +AT_DEPRECATED(virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const); +""") +TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + ${type_definition_body} +} +""") +DEPRECATED_TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + TensorOptions options(*this); + ${device_guard_declaration} + return at::native::${api_name}(${type_method_actuals}, options); +} +""") +# 4. add virtual override to TypeDerived.h +TYPE_DERIVED_DECLARATION = CodeTemplate("""\ +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; +""") +# 5. add override definition to TypeDerived.cpp +TYPE_DERIVED_DEFINITION = CodeTemplate("""\ +${return_type} ${Type}::${method_prefix_derived}${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + ${type_definition_body} +} +""") +# NB: As far as ezyang can tell, we don't *have* to codegen this, +# because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in +# the superclass. But it doesn't seem to be harmful. 
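# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original diff): the
# CodeTemplate objects above drive the generated Type methods.  The real
# CodeTemplate class lives in code_template.py (not shown here); the toy
# stand-in below only mimics its ${placeholder} substitution so the expansion
# of the TYPE_METHOD_DEFINITION_CONCRETE text can be seen for a hypothetical
# 'add_out'-style option.  Every value in the sample environment is made up.
from string import Template

TOY_TEMPLATE = Template(
    "${return_type} Type::${api_name}(${type_method_formals}) const {\n"
    "    ${device_guard_declaration}\n"
    "    ${type_definition_body}\n"
    "}\n")

sample_env = {
    'return_type': 'Tensor &',
    'api_name': 'add_out',   # hypothetical operator name
    'type_method_formals': 'Tensor & result, const Tensor & self, Scalar other',
    'device_guard_declaration': 'const DeviceGuard device_guard(self);',
    'type_definition_body': 'return at::native::add_out(result, self, other);',
}

print(TOY_TEMPLATE.substitute(sample_env))
# Unlike this toy, the real CodeTemplate also accepts list-valued
# substitutions (e.g. the formals list) and takes care of indentation.
# ---------------------------------------------------------------------------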
+TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ +${return_type} ${Type}::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + const auto& self_ty = *this; + (void)self_ty; + ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); +} +""") +TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ +${return_type} ${Type}::${api_name}(${type_method_formals}) const { + AT_ERROR("${api_name} not supported on ${Type}"); +} +""") +TYPE_DEFINITION_BODY_NATIVE = CodeTemplate("""\ +${return_call} at::native::${native_type_method_dispatch}(/* native_actuals */ ${native_actuals}); +""") + +# add non-virtual declaration to Tensor.h +TENSOR_METHOD_DECLARATION = CodeTemplate("""\ +${return_type} ${api_name}(${method_formals_with_defaults})${const_mark}; +""") +# add non-virtual declaration to Tensor.cpp +TENSOR_METHOD_DEFINITION = CodeTemplate("""\ +inline ${return_type} Tensor::${api_name}(${method_formals})${const_mark} { + return type().${api_name}(${method_actuals}); +} +""") +# add a method declaration in Functions.h +FUNCTION_DECLARATION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals_with_defaults}); +""") +# add a method declaration in Functions.h +DEPRECATED_FUNCTION_DECLARATION = CodeTemplate("""\ +AT_DEPRECATED(static inline ${return_type} ${api_name}(${formals_with_defaults})); +""") +# add method definition in Functions.h +FUNCTION_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + return ${inferred_type}.${api_name}(${type_method_actuals}); +} +""") +# add a native declaration for a native function +NATIVE_DECLARATION = CodeTemplate("""\ +AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +""") + +# special method definition for factory functions in Functions.h +FACTORY_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + const DeviceGuard guard(options.device()); + return at::native::${api_name}(${type_method_actuals}); +} +""") + +# special method definition for *deprecated* factory functions in Functions.h +DEPRECATED_FACTORY_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + return at::${api_name}(${type_method_actuals}, TensorOptions(${inferred_type})); +} +""") + +# We need to cast to the base type because C++ may hide the base class +# implementation of ${api_name} if we have overloaded a function with +# the same name (but different signature) already +ZERO_DIM_CHECK = CodeTemplate("""\ +if (${check_name}.dim() == 0) { + return static_cast(this)->${api_name}(${zero_dim_actuals}); +}""") + +ZERO_DIM_ONLY = CodeTemplate("""\ +AT_ERROR("${api_name} only supports a 0-dimensional ${check_name} tensor, but got tensor " + "with ", ${check_name}.dim(), " dimension(s)."); +""") + +SPARSE_CHECK = CodeTemplate("""\ +if(${check_name}.type().is_sparse()) { + return static_cast(this)->${api_name}(${sparse_actuals}); +}""") + +BUFFER_DEFINITION = CodeTemplate("""\ +auto ${name}_ = new ${Tensor}(context); +auto ${name} = Tensor(${name}_, false);""") + +CONDITIONAL_INITIALIZER = CodeTemplate("""\ +if (${name}.defined()) { + ${initializer} +}""") + +CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") + +HALF_CONVERSION = CodeTemplate("convert(${value})") + + +class NYIError(Exception): + """Indicates we don't support this declaration yet""" + + def __init__(self, reason): + self.reason = reason + + +TYPE_FORMAL_GENERIC = { + 'THTensor*': 'Tensor &', + 'THSTensor*': 
'SparseTensorRef', + 'THBoolTensor*': 'Tensor &', + 'THIndexTensor*': 'Tensor &', + 'THIntegerTensor*': 'Tensor &', + 'THDenseTensor*': 'Tensor &', + 'THDenseIndexTensor*': 'Tensor &', + 'THStorage*': 'Storage &', + 'THGenerator*': 'Generator *', + 'THSize*': 'IntList', + 'THStride*': 'IntList', + 'accreal': 'Scalar', + 'real': 'Scalar', + 'long': 'int64_t', +} + +DYNAMIC_TYPE = { + 'THTensor*': 'Tensor', + 'THSTensor*': 'SparseTensorRef', + 'THBoolTensor*': 'BoolTensor', + 'THIndexTensor*': 'IndexTensor', + 'THIntegerTensor*': 'IntegerTensor', + 'THDenseTensor*': 'Tensor', + 'THDenseIndexTensor*': 'IndexTensor', + 'THStorage*': 'Storage', + 'THGenerator*': 'Generator*', + 'THSize*': 'IntList', + 'THStride*': 'IntList', + 'accreal': 'accreal', + 'real': 'real', + 'long': 'int64_t', +} + +NATIVE_DYNAMIC_TYPE = { + 'Tensor &': 'Tensor', + 'const Tensor &': 'Tensor', +} + +TYPE_RETURN = { + 'THTensor*': 'Tensor', + 'THIndexTensor*': 'Tensor', + 'THBoolTensor*': 'Tensor', + 'THIntegerTensor*': 'Tensor', + 'THSTensor*': 'Tensor', + 'THDenseTensor*': 'Tensor', + 'THDenseIndexTensor*': 'Tensor', + 'real': 'Tensor', + 'accreal': 'Tensor', + 'long': 'int64_t', +} + +CHECKED_CAST = { + 'THTensor*': + CodeTemplate( + 'checked_cast_tensor<${Tensor}>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THSTensor*': + CodeTemplate( + 'checked_cast_tensor(${arg_name}.tref.pImpl,"${arg_name}",${arg_pos},false)'), + 'THBoolTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}ByteTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THIndexTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}LongTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THIntegerTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}IntTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THDenseTensor*': + CodeTemplate( + 'checked_cast_tensor<${DenseTensor}>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THDenseIndexTensor*': + CodeTemplate( + 'checked_cast_tensor<${DenseBackend}LongTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THStorage*': CodeTemplate('checked_cast_storage<${Storage}>(&${arg_name},"${arg_name}",${arg_pos})'), + 'THGenerator*': + CodeTemplate( + 'check_generator<${Backend}Generator>(${arg_name}, &context->defaultGenerator(backend()))'), + # This is a cast done via direct-construction + 'THSize*': CodeTemplate('THLongStorageView ${result_name}(${arg_name}, THLongStorageViewKind::SIZE);'), + # This is a cast done via direct-construction + 'THStride*': CodeTemplate('THLongStorageView ${result_name}(${arg_name}, THLongStorageViewKind::STRIDE);'), + 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), + 'accreal': CodeTemplate('${arg_name}.to${AccScalarName}()'), + 'TensorList': CodeTemplate('tensor_list_checked_cast<${Tensor}, Tensor, ' + '${THTensor}>(${arg_name},"${arg_name}",${arg_pos})'), + 'IntList': CodeTemplate('check_intlist<${size}>(${arg_name}, "${arg_name}", ${arg_pos}${,default_init})') +} + +DIRECT_CONSTRUCTION_CHECKED_CAST = {'THSize*', 'THStride*'} + +CHECKED_USE = { + 'THTensor*': '{}_->tensor', + 'THSTensor*': '{}_->tensor', + 'THIndexTensor*': '{}_->tensor', + 'THBoolTensor*': '{}_->tensor', + 'THIntegerTensor*': '{}_->tensor', + 'THDenseTensor*': '{}_->tensor', + 'THDenseIndexTensor*': '{}_->tensor', + 'THStorage*': '{}_->storage', + 'THGenerator*': '{}_->generator', + 'TensorList': "{0}_.data(), {0}_.size()", +} + +CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') + +ALLOC_WRAP = { + 'THTensor*': 'new ${Tensor}(context${,arguments})', + 'THBoolTensor*': 'new ${Backend}ByteTensor(context${,arguments})', + 'THIndexTensor*': 'new ${Backend}LongTensor(context${,arguments})', + 'THIntegerTensor*': 'new ${Backend}IntTensor(context${,arguments})', + 'THSTensor*': 'new Sparse${Tensor}(context${,arguments})', + 'THDenseTensor*': 'new ${DenseTensor}(context${,arguments})', + 'THDenseIndexTensor*': 'new ${DenseBackend}LongTensor(context${,arguments})', +} + +# Replacements for constants when calling into TH +CONSTANT_REPLACEMENTS = [ + ('AS_REAL', '${AS_REAL}'), + ('__storage_size.get\\(\\)', + 'THLongStorageView(static_cast(source.size()), THLongStorageViewKind::LENGTH)'), + ('__last_dim', 'self.ndimension()-1'), +] + +# Replacements for constants in header file function definitions +HEADER_CONSTANT_REPLACEMENTS = [ + (r'AS_REAL\((.*)\)', r'\1'), + ('__last_dim', '-1'), +] + + +class nested_dict(object): + def __init__(self, base, parent): + self.base, self.parent = base, parent + + def __getitem__(self, x): + r = self.base.get(x) + if r is not None: + return r + return self.parent[x] + + +Environment = TypedDict('Environment', { + 'ScalarName': str, + 'THTensor': str, + 'THType': str, + 'THTensor': str, + 'Backend': str, + 'AccScalarName': str, +}) + +TopEnvironment = TypedDict('TopEnvironment', { + 'type_registrations': List[str], + 'type_headers': List[str], + 'type_method_declarations': List[str], + 'type_method_definitions': List[str], + 'type_method_inline_definitions': List[str], + 'tensor_method_declarations': List[str], + 'tensor_method_definitions': List[str], + 'function_declarations': List[str], + 'function_definitions': List[str], + 'type_ids': List[str], + 'native_function_declarations': List[str], +}) + +# A Declarations.cwrap formal argument +# type can contain THTensor* types +THFormal = TypedDict('THFormal', { + 'name': str, + 'type': str, + 'dynamic_type': str, + 'kwarg_only': bool, + 'is_nullable': bool, + 'default': str, + 'default_init': str, + 'python_default_init': str, + 'output': bool, + 'size': int, + 'declared_type': str, + 'ignore_check': bool, + 'allocate': bool, + 'mask': bool, + 'if_true': bool, + 'if_false': bool, + 'wrap_dim': str, + # Broadcast is originally a str but gets unwrapped to a List or Dict in-place + 'broadcast': Any, + 'resize': str, + 'cpu_zero': bool, + 'zero': bool, + 'is_type_dispatched': bool, +}, total=False) + +# Generic ATen formal or native_functions.yaml formal argument. +# type can contain Tensor& reference types. 
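# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the
# nested_dict class above is what lets an option-level environment shadow the
# shared top_env during template substitution (env = nested_dict(option,
# top_env) further down).  The condensed copy below shows the lookup order
# with made-up keys and values.
class _nested_dict_sketch(object):
    def __init__(self, base, parent):
        self.base, self.parent = base, parent

    def __getitem__(self, x):
        r = self.base.get(x)
        return r if r is not None else self.parent[x]

top_env_example = {'Backend': 'CPU', 'ScalarName': 'Float'}
option_example = {'ScalarName': 'Double'}          # option-level value wins
env = _nested_dict_sketch(option_example, top_env_example)
print(env['ScalarName'], env['Backend'])           # -> Double CPU
# ---------------------------------------------------------------------------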
+AtFormal = TypedDict('AtFormal', { + 'name': str, + 'type': str, + 'dynamic_type': str, + 'kwarg_only': bool, + 'is_nullable': bool, + 'default': str, + 'default_init': str, + 'python_default_init': str, + 'output': bool, + 'size': int, + 'is_type_dispatched': bool, +}, total=False) + +ReturnType = TypedDict('ReturnType', { + 'name': str, + 'type': str, + 'dynamic_type': str, +}, total=False) + +ReturnDecl = TypedDict('ReturnDecl', { + 'kind': str, + 'type': str, + 'arguments': List[int], +}, total=False) + +# Represents a buffer in nn.yaml +NNBuffer = TypedDict('NNBuffer', { + 'name': str, +}) + +FunctionOption = TypedDict('FunctionOption', { + 'actuals': List[str], + 'api_name': str, + 'arguments': List[THFormal], + 'aten_custom_call': str, + 'aten_dense_sparse': bool, + 'backend_type_pairs': List[Tuple[str, str]], + 'backends': List[str], + 'broadcast_actuals': List[str], + 'broadcast_function': str, + 'broadcast_modified_actuals': List[str], + 'broadcast_returns': List[str], + 'buffers': List[NNBuffer], + # cimpls is really a List[FunctionOption] + 'cimpls': List[Any], + 'cname': str, + 'condition': str, + 'const_mark': str, + 'device_guard': bool, + 'device_guard_declaration': str, + 'with_gil': bool, + 'cpu_half': bool, + 'deprecated': bool, + 'formals_list': List[AtFormal], + 'formals_with_defaults': List[str], + 'formals': List[str], + 'inferred_type': str, + 'inplace': bool, + 'method_actuals': List[str], + 'method_formals_with_defaults': List[str], + 'method_formals': List[str], + 'method_prefix_derived': str, + 'mode': str, + 'name': str, + 'native_actuals': List[str], + 'native_type_method_dispatch': str, + # options should be List[FunctionOption] + 'options': Any, + 'return_call': str, + 'return_type': str, + 'return': ReturnDecl, + 'returns': List[ReturnType], + 'scalar_check': str, + 'sparse': bool, + 'type_definition_body': List[str], + 'type_method_actuals': List[str], + 'type_method_definition_dispatch': str, + 'type_method_formals_with_defaults': List[str], + 'type_method_formals': List[str], + 'variants': str, + 'when_spares_dispatch': str, + 'when_sparse_dispatch': str, + 'with_gil': bool, + 'zero_dim_dispatch_when_scalar': str, + 'zero_dim_tensor_only': bool, +}) + +OutputDeclaration = NamedTuple('OutputDeclaration', [ + ('name', str), + ('method_prefix_derived', str), + ('arguments', List[AtFormal]), + ('method_of', List[str]), + ('mode', str), + ('buffers', Optional[List[str]]), + ('returns', List[ReturnType]), + ('inplace', bool), + ('abstract', bool), + ('device_guard', bool), + ('with_gil', bool), + ('deprecated', bool), +]) + + +def device_guard(option, formals, is_factory_method=False): + # For factory methods the `DeviceGuard` is already in the template. 
+ if option.get('device_guard', True) and not is_factory_method: + tensor_arguments = [f for f in formals if f['dynamic_type'] in {'Tensor', 'TensorList'}] + if tensor_arguments: + tensor_argument = tensor_arguments[0]['name'] + return 'const DeviceGuard device_guard({});'.format(tensor_argument) + return '// DeviceGuard omitted' + + +def is_real_argument_to_wrapper(argument): + # type: (THFormal) -> bool + return not argument.get('output', False) and\ + argument['type'] != 'CONSTANT' and\ + argument['type'] != 'argument' + + +def is_mutable_formal_argument(argument, option): + # type: (THFormal, FunctionOption) -> bool + return argument.get('output') or option['inplace'] and argument['name'] == 'self' + + +def to_return_type(arg, option): + # type: (THFormal, FunctionOption) -> ReturnType + t = arg['type'] + rt = TYPE_RETURN.get(t, t) + if rt == 'Tensor' and not arg.get('allocate'): + rt = rt + ' &' + if not is_mutable_formal_argument(arg, option): + rt = 'const ' + rt + return { + 'name': arg['name'], + 'type': rt, + 'dynamic_type': DYNAMIC_TYPE.get(arg['type'], arg['type']), + } + + +def create_generic(top_env, declarations): + # type: (TopEnvironment, List[FunctionOption]) -> List[OutputDeclaration] + # translates defaults from cwrap types to C++ values + def translate_default(argument, type_str, default): + # type: (THFormal, str, Any) -> Any + if default is None: + # cause the default constructor for the object to run + return '{}' + if 'if_true' in argument: + return argument['default'] == argument['if_true'] + for pattern, replacement in HEADER_CONSTANT_REPLACEMENTS: + default = re.sub(pattern, replacement, str(default)) + if type_str in {'Scalar', 'int64_t', 'double'}: + try: + return int(default) + except Exception: + try: + return float(default) + except Exception: + return default + elif type_str == 'bool': + assert default.lower() in ['true', 'false'] + return default.lower() == 'true' + else: + return default + + # change from THTensor* to Tensor & so we get how it will appear + # in the aten argument list... 
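# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the
# device_guard helper defined above resolves to one of two strings.  The
# condensed restatement below mirrors its logic so both outcomes can be shown
# for made-up formals; it is not the generator's real entry point.
def _device_guard_sketch(option, formals, is_factory_method=False):
    if option.get('device_guard', True) and not is_factory_method:
        tensors = [f for f in formals if f['dynamic_type'] in {'Tensor', 'TensorList'}]
        if tensors:
            return 'const DeviceGuard device_guard({});'.format(tensors[0]['name'])
    return '// DeviceGuard omitted'

sample_formals = [{'name': 'self', 'dynamic_type': 'Tensor'},
                  {'name': 'other', 'dynamic_type': 'Scalar'}]
print(_device_guard_sketch({}, sample_formals))
# -> const DeviceGuard device_guard(self);
print(_device_guard_sketch({'device_guard': False}, sample_formals))
# -> // DeviceGuard omitted
# ---------------------------------------------------------------------------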
+ def translate_formal(argument, option): + # type: (THFormal, FunctionOption) -> AtFormal + type_str = TYPE_FORMAL_GENERIC.get(argument['type'], argument['type']) + if type_str == 'Tensor &' and not is_mutable_formal_argument(argument, option): + type_str = 'const ' + type_str + translated = { + 'name': argument['name'], + 'type': type_str, + 'dynamic_type': DYNAMIC_TYPE.get(argument['type'], argument['type']), + } # type: AtFormal + if 'kwarg_only' in argument: + translated['kwarg_only'] = argument['kwarg_only'] + if 'default' in argument: + default = translate_default(argument, type_str, argument['default']) + translated['default'] = default + translated['default_init'] = argument.get('default_init', default) + if 'python_default_init' in argument: + assert 'default' not in argument + default = translate_default(argument, type_str, argument['python_default_init']) + translated['python_default_init'] = default + if argument.get('output'): + translated['output'] = True + if argument.get('size'): + translated['size'] = argument['size'] + if argument.get('is_nullable') is not None: + translated['is_nullable'] = argument['is_nullable'] + return translated + + def get_formals(option, include_constants=False): + # type: (FunctionOption, bool) -> List[AtFormal] + seen = set() # type: Set[str] + pos_args = [] # type: List[THFormal] + kwd_args = [] # type: List[THFormal] + + def insert(argument): + # type: (THFormal) -> None + if argument['name'] not in seen: + seen.add(argument['name']) + if argument.get('kwarg_only', False): + kwd_args.append(argument) + else: + pos_args.append(argument) + + def has_output_mask(argument): + # type: (THFormal) -> bool + return argument.get('allocate', False) and argument.get('mask', False) + + for argument in option['arguments']: + if argument.get('output') and not argument.get('allocate', False): + insert(argument) + for argument in option['arguments']: + if argument['type'] == 'THSTensor*': + # only enable for a subset of Dense/Sparse ops + if not (option.get('aten_dense_sparse', False)): + raise NYIError("Sparse Tensor") + + if include_constants and argument['type'] == 'CONSTANT': + insert(argument) + elif is_real_argument_to_wrapper(argument): + insert(argument) + if any(has_output_mask(arg) for arg in option['arguments']): + mask_size = sum(has_output_mask(arg) for arg in option['arguments']) + insert({ + 'name': 'output_mask', + # NB: Lack of space in comma works around parsing + # problem in gen_variable_type.py + 'type': 'std::array'.format(mask_size), + 'default': '{{' + ', '.join(['true'] * mask_size) + '}}', + }) + + result = pos_args + kwd_args + return [translate_formal(argument, option) for argument in result] + + def get_return_types(option): + # type: (FunctionOption) -> List[ReturnType] + ret = option['return'] + if ret['kind'] == 'arguments': + argument_indices = ret['arguments'] + if len(argument_indices) == 1: + the_arg = option['arguments'][argument_indices[0]] + return [to_return_type(the_arg, option)] + else: + return [to_return_type(option['arguments'][idx], option) + for idx in argument_indices] + elif ret['kind'] == 'type': + return [{ + 'type': TYPE_RETURN.get(ret['type'], ret['type']), + 'dynamic_type': DYNAMIC_TYPE.get(ret['type'], ret['type']), + }] + else: + raise Exception("format_return_type") + + def format_return_type(return_types): + # type: (List[ReturnType]) -> str + if len(return_types) == 1: + return return_types[0]['type'] + return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) + + def 
find_dispatch_tensor(formals): + # type: (List[AtFormal]) -> Optional[str] + # dispatch to self if it's a parameter + for formal in formals: + if formal['name'] == 'self' and formal['dynamic_type'] == 'Tensor': + return formal['name'] + # otherwise dispatch to the first Tensor or TensorList + for formal in formals: + if 'TensorList' == formal['dynamic_type'] or formal['dynamic_type'] == 'Tensor': + return formal['name'] + return None + + def format_formal(f): + # type: (AtFormal) -> str + return '{} {}'.format(f['type'], f['name']) + + def formal_with_default(f): + # type: (AtFormal) -> str + s = format_formal(f) + v = f.get('default') + if v is None: + return s + if isinstance(v, bool): + v = str(v).lower() + return '{}={}'.format(s, v) + + def get_broadcast_argument(option): + # type: (FunctionOption) -> Optional[THFormal] + for argument in option['arguments']: + if argument.get('broadcast'): + return argument + return None + + def get_broadcast_actuals(broadcast_arg, broadcast_inplace, broadcast_dims): + # type: (THFormal, bool, bool) -> List[str] + # Note: broadcast_dims can change type... + # return the actuals that will be passed to the broadcast function. + # 1) in the common case, this is the broadcasted argument (e.g. "self") followed by the tensors + # that it is broadcasted against (comma-separated) (e.g. "self, tensor1, tensor2"). + # 2) in the broadcast_dims case, this is the broadcasted argument (e.g. "self") followed by the sizes + # it is broadcasted to (as an initializer list), so e.g. the specification + # "mat1.dim0,mat2.dim1" gets transformed to "self, {mat1.size(0),mat2.size(1)}" + if not broadcast_dims: + broadcast_actuals = [broadcast_arg['name']] + broadcast_arg['broadcast'].split()[0].split(",") + else: + broadcast_dims_spec = broadcast_arg['broadcast'].split()[1].split(':')[1].split(',') + # generate size call for each dimension + broadcast_dims = ([x.split('.')[0] + '.size(' + x.split('.')[1].replace('dim', '') + ')' # type: ignore + for x in broadcast_dims_spec]) + broadcast_dims_init_list = '{' + ','.join(broadcast_dims) + '}' # type: ignore + broadcast_actuals = [broadcast_arg['name'], broadcast_dims_init_list] + + return broadcast_actuals + + def emit_nn_body(option): + # type: (FunctionOption) -> Union[str, List[str]] + # Concrete definition on Type.cpp for NN functions. Delegates to the + # xxx_forward variant variant after creating any necessary buffers. 
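# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the two
# branches of get_broadcast_actuals above are easier to picture with concrete
# inputs.  The condensed restatement below reproduces just that string
# handling; the broadcast specs fed to it are made-up examples shaped like
# the ones the comment describes (a plain tensor list vs. a "dims:" spec).
def _broadcast_actuals_sketch(arg_name, broadcast_spec):
    parts = broadcast_spec.split()
    if len(parts) < 2 or not parts[1].startswith('dims:'):
        # common case: broadcast against the listed tensors
        return [arg_name] + parts[0].split(',')
    # dims case: broadcast to an initializer list of sizes
    dims = ['{}.size({})'.format(x.split('.')[0], x.split('.')[1].replace('dim', ''))
            for x in parts[1].split(':')[1].split(',')]
    return [arg_name, '{' + ','.join(dims) + '}']

print(_broadcast_actuals_sketch('self', 'tensor1,tensor2 fallback'))
# -> ['self', 'tensor1', 'tensor2']
print(_broadcast_actuals_sketch('self', 'mat1,mat2 dims:mat1.dim0,mat2.dim1'))
# -> ['self', '{mat1.size(0),mat2.size(1)}']
# ---------------------------------------------------------------------------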
+ actuals = option['actuals'] + base_name = option['name'][:-1] if option['inplace'] else option['name'] + fwd_name = option['api_name'].replace(base_name, base_name + '_forward') + + if len(option['buffers']) == 0: + return 'return {}({});'.format(fwd_name, ', '.join(actuals)) + + body = [] # type: List[str] + if option['api_name'].endswith('_out'): + # _out variants must create buffers and insert them in the + # arguments list between output and input arguments + for buffer in option['buffers']: + body.append('Tensor {} = tensor();'.format(buffer['name'])) + actuals = [arg['name'] for arg in option['arguments'] if arg.get('output')] + actuals += [buffer['name'] for buffer in option['buffers']] + actuals += [arg['name'] for arg in option['arguments'] if not arg.get('output')] + + body.append('return std::get<0>({}({}));'.format(fwd_name, ', '.join(actuals))) + return body + + def process_option(option, output_options): + # type: (FunctionOption, List[OutputDeclaration]) -> None + option['inplace'] = re.search( + '(^__i|[^_]_$)', option['api_name']) is not None + + # print(yaml.dump(option)) + formals = get_formals(option) + option['formals_list'] = formals + option['formals'] = [format_formal(f) for f in formals] + option['formals_with_defaults'] = [formal_with_default(f) for f in formals] + option['returns'] = get_return_types(option) + option['return_type'] = format_return_type(option['returns']) + option['return_call'] = 'return ' if option['return_type'] != 'void' else '' + option['actuals'] = [f['name'] for f in formals] + + option['method_formals'] = [format_formal(f) for f in formals + if f['name'] != 'self'] + option['method_formals_with_defaults'] = ( + [formal_with_default(f) for f in formals if f['name'] != 'self']) + option['method_actuals'] = [ + f['name'] if f['name'] != 'self' else '*this' for f in formals] + + # There are no cases where these differ, but they do in native_functions + option['type_method_formals'] = option['formals'] + option['type_method_formals_with_defaults'] = option['formals_with_defaults'] + option['type_method_actuals'] = option['actuals'] + + option['const_mark'] = '' if option['inplace'] else ' const' + + is_method = 'method' in option['variants'] + is_function = 'function' in option['variants'] + dispatch_tensor = find_dispatch_tensor(formals) + is_namespace_function = is_function and dispatch_tensor is not None + + broadcast_arg = get_broadcast_argument(option) + # "s_" for "same size". + option['method_prefix_derived'] = '' if broadcast_arg is None else 's_' + option['device_guard_declaration'] = device_guard(option, formals) + + env = nested_dict(option, top_env) + + mode = option['mode'] + abstract = True + if mode == 'NN' and option.get('cimpls') is None: + # NN function with no _forward/_backward suffix don't have cimpls. 
+ # They call the _forward function and discard any buffer returns + abstract = False + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + body = emit_nn_body(option) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_CONCRETE.substitute( + env, type_definition_body=body)) + elif broadcast_arg is None: + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + else: + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + + broadcast_inplace = 'inplace' in broadcast_arg['broadcast'] + broadcast_dims = 'dims:' in broadcast_arg['broadcast'] + option['broadcast_actuals'] = get_broadcast_actuals(broadcast_arg, broadcast_inplace, broadcast_dims) + if not broadcast_dims: + option['broadcast_returns'] = (["b_" + x for x in option['broadcast_actuals'] + if x != broadcast_arg['name'] or not broadcast_inplace]) + else: + option['broadcast_returns'] = ["b_" + broadcast_arg['name']] + + option['broadcast_function'] = 'expand_' + ('inplace' if broadcast_inplace + else 'size' if broadcast_dims else 'outplace') + option['broadcast_modified_actuals'] = ['b_' + y if 'b_' + y in option['broadcast_returns'] else y + for y in option['actuals']] + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_BROADCAST.substitute(env)) + + method_of = ['Type'] + if is_method: + top_env['tensor_method_declarations'].append( + TENSOR_METHOD_DECLARATION.substitute(env)) + top_env['tensor_method_definitions'].append( + TENSOR_METHOD_DEFINITION.substitute(env)) + method_of.append('Tensor') + + if is_namespace_function: + option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + top_env['function_declarations'].append( + FUNCTION_DECLARATION.substitute(env)) + top_env['function_definitions'].append( + FUNCTION_DEFINITION.substitute(env)) + method_of.append('namespace') + + buffer_names = [buffer['name'] for buffer in option.get('buffers', [])] + + output_options.append(OutputDeclaration( + name=option['api_name'], + method_prefix_derived=option['method_prefix_derived'], + arguments=formals, + method_of=method_of, + mode=mode, + buffers=buffer_names, + returns=option['returns'], + inplace=option['inplace'], + # See Note [Abstract ATen methods] + abstract=abstract, + device_guard=option.get('device_guard', True), + with_gil=option.get('with_gil', False), + deprecated=option.get('deprecated', False) + )) + + def native_get_formals(option, include_constants=False): + # type: (FunctionOption, bool) -> List[AtFormal] + seen = set() # type: Set[str] + pos_args = [] + kwd_args = [] + + def insert(argument): + # type: (AtFormal) -> None + if argument['name'] not in seen: + seen.add(argument['name']) + if argument.get('kwarg_only', False): + kwd_args.append(argument) + else: + pos_args.append(argument) + + for argument in option['arguments']: + insert(argument) + + # not clear we need dynamic_type translation as we can specify the correct type + # directly in native functions + def add_dynamic_type(argument, option): + # type: (AtFormal, FunctionOption) -> AtFormal + argument['dynamic_type'] = NATIVE_DYNAMIC_TYPE.get(argument['type'], argument['type']) + return argument + + 
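# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): both
# process_option and process_native decide whether an option is in-place with
# re.search('(^__i|[^_]_$)', api_name).  The quick check below shows which
# made-up api_names that pattern accepts.
import re

for name in ['add', 'add_', 'add_out', '__iadd__']:
    print(name, bool(re.search('(^__i|[^_]_$)', name)))
# -> add False, add_ True, add_out False, __iadd__ True
# ---------------------------------------------------------------------------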
result = pos_args + kwd_args + result = [add_dynamic_type(argument, option) for argument in result] + + # ensure we get reference-type formals when appropriate + def native_translate_formals(argument, option): + # type: (AtFormal, FunctionOption) -> AtFormal + def translate_map(const): + # type: (bool) -> Dict[str, str] + return { + 'Tensor': 'const Tensor &' if const else 'Tensor &', + 'BoolTensor': 'const Tensor &' if const else 'Tensor &', + 'IndexTensor': 'const Tensor &' if const else 'Tensor &', + 'Type': 'const Type &' if const else 'Type &', + 'TensorOptions': 'const TensorOptions &' if const else 'TensorOptions &', + } + + if (option['inplace'] and argument['name'] == 'self') or argument.get('output', False): + argument['type'] = translate_map(False).get(argument['type'], argument['type']) + else: + argument['type'] = translate_map(True).get(argument['type'], argument['type']) + + return argument + + result = [native_translate_formals(argument, option) for argument in result] + return result + + # this can return multiple return types in a list, e.g. ['Tensor', 'Tensor'] + def native_get_return_types(option): + # type: (FunctionOption) -> List[ReturnType] + ret = option['return'] + + return_types = [] # List[ReturnType] + for t_raw in ret: + if isinstance(t_raw, string_type): + t = t_raw + name = None + elif t_raw is None: + t = 'void' + name = None + else: + t = t_raw['type'] + name = t_raw['name'] + + # can't actually return a TensorList (since it's a reference object) + actual_return_type = {'TensorList': 'std::vector'}.get(t, t) + + if actual_return_type == 'Tensor' and (option['inplace'] or option['api_name'].endswith('_out')): + # follow normal ATen convention of returning Tensor & for inplace functions. + actual_return_type = 'Tensor &' + + rtype = { + 'type': actual_return_type, + 'dynamic_type': NATIVE_DYNAMIC_TYPE.get(t, t), + } # type: ReturnType + if name is not None: + rtype['name'] = name + return_types.append(rtype) + + return return_types + + def process_native(option, output_options): + # type: (FunctionOption, List[OutputDeclaration]) -> None + option['inplace'] = re.search( + '(^__i|[^_]_$)', option['api_name']) is not None + + formals = native_get_formals(option) + option['formals_list'] = formals + option['formals'] = [format_formal(f) for f in formals] + option['formals_with_defaults'] = [formal_with_default(f) for f in formals] + option['returns'] = native_get_return_types(option) + option['return_type'] = format_return_type(option['returns']) + option['return_call'] = 'return ' if option['return_type'] != 'void' else '' + option['actuals'] = [f['name'] for f in formals] + + option['method_formals'] = [format_formal(f) for f in formals + if f['name'] != 'self'] + option['method_formals_with_defaults'] = ( + [formal_with_default(f) for f in formals if f['name'] != 'self']) + option['method_actuals'] = [ + f['name'] if f['name'] != 'self' else '*this' for f in formals] + + def find_formal(formal_name, formals): + for formal in formals: + if formal_name == formal['dynamic_type']: + return formal + return None + + dispatch_tensor = find_dispatch_tensor(formals) + dispatch_type = None if dispatch_tensor else find_formal('Type', formals) + if dispatch_type: + dispatch_type['is_type_dispatched'] = True + + option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] + option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] + option['type_method_actuals'] = [f['name'] for f in formals 
if f != dispatch_type] + option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] + + option['const_mark'] = '' if option['inplace'] else ' const' + + is_method = 'method' in option['variants'] + is_namespace_function = 'function' in option['variants'] + is_factory_method = find_formal('TensorOptions', formals) + is_deprecated_factory_method = len(formals) > 0 and \ + formals[0]['dynamic_type'] == 'Type' and \ + option['return_type'] == 'Tensor' and option['deprecated'] + needs_native_definition = not is_deprecated_factory_method + + has_dispatch = dispatch_tensor or dispatch_type + + option['method_prefix_derived'] = '' + option['device_guard_declaration'] = device_guard(option, formals, is_factory_method) + + env = nested_dict(option, top_env) + + broadcast_arg = get_broadcast_argument(option) + if broadcast_arg is not None: + raise Exception("broadcasting is not yet supported for native functions, " + "but specified for function {}", option['name']) + + # Factory methods are not dispatched over `Type`. + if not is_factory_method: + if option['deprecated']: + top_env['type_method_declarations'].append(DEPRECATED_TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + else: + top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + dispatch = option['type_method_definition_dispatch'] + option['native_type_method_dispatch'] = dispatch + + # Note [Abstract ATen methods] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # An abstract ATen method is one whose dispatch differs between + # types. These are implemented in derived types (with a + # standard (throwing) definition in Type). A concrete ATen + # method is one which has the same dispatch for all types; + # we just implement it in the base Type. This is exposed + # in Declarations.yaml via a field named 'abstract'. + abstract = False + if isinstance(dispatch, dict): + abstract = True + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + elif is_deprecated_factory_method: + top_env['type_method_definitions'].append( + DEPRECATED_TYPE_METHOD_DEFINITION_CONCRETE.substitute(env)) + elif not is_factory_method: + body = TYPE_DEFINITION_BODY_NATIVE.substitute(env) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_CONCRETE.substitute( + env, type_definition_body=body)) + + # generate the at::native function declarations (i.e. 
what the user will implement) + if needs_native_definition: + if isinstance(dispatch, dict): + generated_native_functions = [] # type: List[str] + for key in sorted(dispatch.keys()): + value = dispatch[key] + if value not in generated_native_functions: + option['native_type_method_dispatch'] = value + top_env['native_function_declarations'].append( + NATIVE_DECLARATION.substitute(env)) + generated_native_functions.append(value) + else: + top_env['native_function_declarations'].append( + NATIVE_DECLARATION.substitute(env)) + + method_of = ['Type'] + if is_method: + top_env['tensor_method_declarations'].append( + TENSOR_METHOD_DECLARATION.substitute(env)) + top_env['tensor_method_definitions'].append( + TENSOR_METHOD_DEFINITION.substitute(env)) + method_of.append('Tensor') + + if is_namespace_function: + if dispatch_type: + option['inferred_type'] = dispatch_type['name'] + elif dispatch_tensor: + option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + else: + # doesn't depend on a specific type, use undefined float + option['inferred_type'] = 'at::getType(at::Backend::Undefined, at::ScalarType::Float)' + declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION + top_env['function_declarations'].append(declaration.substitute(env)) + if is_factory_method: + top_env['function_definitions'].append(FACTORY_DEFINITION.substitute(env)) + elif is_deprecated_factory_method: + top_env['function_definitions'].append(DEPRECATED_FACTORY_DEFINITION.substitute(env)) + else: + top_env['function_definitions'].append(FUNCTION_DEFINITION.substitute(env)) + method_of.append('namespace') + + output_options.append(OutputDeclaration( + name=option['api_name'], + method_prefix_derived=option['method_prefix_derived'], + arguments=formals, + method_of=method_of, + mode=option['mode'], + buffers=None, + returns=option['returns'], + inplace=option['inplace'], + # See Note [Abstract ATen methods] + abstract=abstract, + device_guard=option.get('device_guard', True), + with_gil=option.get('with_gil', False), + deprecated=option['deprecated'], + )) + + output_declarations = [] # type: List[OutputDeclaration] + for declaration in declarations: + output_options = [] # type: List[OutputDeclaration] + for option in declaration['options']: + try: + if option['mode'] != 'native': + process_option(option, output_options) + else: + process_native(option, output_options) + except NYIError: + option['skip'] = True + output_declarations.extend(output_options) + return output_declarations + + +def create_derived(backend_type_env, declarations): + # type: (Environment, List[FunctionOption]) -> Tuple[List[str], List[str]] + type_object_declarations = [] + type_object_definitions = [] + + is_cuda = 'CUDA' in backend_type_env['Backend'] + + real_is_half = backend_type_env['ScalarName'] == 'Half' + + def replace_with_null(argument): + # type: (THFormal) -> bool + return (argument['type'] == 'THGenerator*' and + backend_type_env['Backend'] == 'CUDA') + + def requires_checked_cast(argument): + # type: (THFormal) -> bool + if argument['type'] == 'IntList': + return 'size' in argument + return argument['type'] in CHECKED_CAST + + def nullable_argument(argument): + # type: (THFormal) -> bool + return argument.get('is_nullable', False) + + def bool_option_is_string(argument): + # type: (THFormal) -> bool + return 'if_true' in argument and isinstance(argument['if_true'], string_type) + + def get_argument(argument, option): + # type: (THFormal, FunctionOption) -> str + if 
replace_with_null(argument): + return 'NULL' + elif requires_checked_cast(argument): + checked_use = CHECKED_USE.get( + argument['type'], '{}_').format(argument['name']) + if real_is_half and argument['type'] == 'real': + checked_use = HALF_CONVERSION.substitute(value=checked_use) + if nullable_argument(argument): + checked_use = CHECKED_USE_NULLABLE.substitute( + env={}, arg_name=argument['name'], usage=checked_use) + return checked_use + elif argument['type'] == 'bool' and 'if_true' in argument: + if bool_option_is_string(argument): + tpl = '({}) ? "{}" : "{}"' + else: + tpl = '({}) ? {} : {}' + return tpl.format(argument['name'], + argument['if_true'], argument['if_false']) + elif argument['type'] == 'CONSTANT': + # this is a bool that is actually a string... + if bool_option_is_string(argument): + return '"{}"'.format(argument['name']) + v = str(argument.get('default', argument['name'])) + for pattern, replacement in CONSTANT_REPLACEMENTS: + v = re.sub(pattern, replacement, v) + return CodeTemplate(v).substitute(backend_type_env) + # e.g. argument 0, i.e. repeat the 0th argument in this position... + elif argument['type'] == 'argument': + index = int(argument['name']) + return get_argument(option['arguments'][index], option) + else: + return argument['name'] + + def drop_argument(argument, option): + # type: (THFormal, FunctionOption) -> bool + # Devices are handled in the body of the function. + if argument['name'] == 'device': + return True + return 'CUDA' in backend_type_env['Backend'] and ( + option['mode'] == 'TH' and argument['type'] == 'THGenerator*') + + def get_arguments(arguments, option): + # type: (List[THFormal], FunctionOption) -> List[str] + return [get_argument(argument, option) + for argument in arguments if not drop_argument(argument, option)] + + def is_actual_return_long(ret): + # type: (ReturnDecl) -> bool + if ret['type'] == 'long': + return True + if ret['type'] == 'real': + return backend_type_env['ScalarName'] == 'Long' + if ret['type'] == 'accreal': + return backend_type_env['AccScalarName'] == 'Long' + return False + + def handle_zero_dim(env, option): + # type: (Environment, FunctionOption) -> List[str] + zero_dim_dispatch = option.get('zero_dim_dispatch_when_scalar', '') + if not zero_dim_dispatch: + return [] + broadcasts_arg = zero_dim_dispatch in option.get('broadcast_actuals', '') + zero_dim_only = option.get('zero_dim_tensor_only', False) + # this combination doesn't seem to make sense + assert not (broadcasts_arg and zero_dim_only) + # if the argument broadcasts, then this would only affect cases where all broadcasted + # tensors were zero-dim, which is inconsistent with the scalar handling. 
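# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): in
# get_argument above, a bool formal carrying if_true/if_false values is
# lowered to a C-style ternary, with quoting when the two values are strings.
# The snippet below replays that formatting for two made-up arguments.
def _bool_argument_sketch(arg):
    tpl = '({}) ? "{}" : "{}"' if isinstance(arg['if_true'], str) else '({}) ? {} : {}'
    return tpl.format(arg['name'], arg['if_true'], arg['if_false'])

print(_bool_argument_sketch({'name': 'keepdim', 'if_true': 1, 'if_false': 0}))
# -> (keepdim) ? 1 : 0
print(_bool_argument_sketch({'name': 'largest', 'if_true': 'largest', 'if_false': 'smallest'}))
# -> (largest) ? "largest" : "smallest"
# ---------------------------------------------------------------------------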
+ if broadcasts_arg: + return [] + zero_dim_actuals = [arg['name'] + if arg['name'] != zero_dim_dispatch else "Scalar({})".format(arg['name']) + for arg in option['formals_list']] + return [ZERO_DIM_CHECK.substitute(env, check_name=zero_dim_dispatch, zero_dim_actuals=zero_dim_actuals)] + + def handle_only_zero_dim(env, option): + # type: (Environment, FunctionOption) -> Optional[List[str]] + if option.get('zero_dim_tensor_only', False): + check_name = option['zero_dim_dispatch_when_scalar'] + return [ZERO_DIM_ONLY.substitute(env, check_name=check_name)] + else: + return None + + def handle_sparse(env, option): + # type: (Environment, FunctionOption) -> List[str] + if 'when_sparse_dispatch' not in option or 'Sparse' in backend_type_env['Backend']: + return [] + check_name = option['when_sparse_dispatch'] + sparse_actuals = [arg['name'] + if arg['name'] != check_name else "SparseTensorRef({})".format(arg['name']) + for arg in option['formals_list']] + return [SPARSE_CHECK.substitute(env, check_name=check_name, sparse_actuals=sparse_actuals)] + + def allocate_arg(env, arg, output_count): + # type: (Environment, THFormal, int) -> List[str] + name = arg['name'] + allocation = CodeTemplate(ALLOC_WRAP[arg['type']]).substitute(env, arguments=[]) + tensor_arg = '{}_'.format(name) + if arg.get('mask', False): + allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) + tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + .format(name, name)) + return [ + 'auto {}_ = {};'.format(name, allocation), + 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + ] + + def resize_arg(arg): + # type: (THFormal) -> str + resize = arg['resize'] + if isinstance(resize, str): + return "{}.resize_({}.sizes());".format(arg['name'], resize) + else: + resize_scalar = arg.get('resize_scalar', False) + if resize_scalar: + dims = ['{}.dim() == 0 ? 1 : {}.size({})'.format(name, name, dim) for name, dim in resize] + else: + dims = ['{}.size({})'.format(name, dim) for name, dim in resize] + return "{}.resize_({{ {} }});".format(arg['name'], ','.join(dims)) + + def handle_call(env, option, cimpl): + # type: (Environment, FunctionOption, FunctionOption) -> str + is_nn = option['mode'] == 'NN' + actuals = get_arguments(cimpl['arguments'], option) + if is_cuda or is_nn: + actuals = ['context->getTHCState()'] + actuals + + cname = cimpl['cname'] + if option.get('sparse', False): + if is_cuda: + cname = 'THCS' + env['ScalarName'] + "Tensor_" + cname + else: + cname = env['THTensor'].replace('TH', 'THS') + '_' + cname + elif is_nn: + cname = 'THNN_{}'.format(env['THType']) + cname + else: + cname = env['THTensor'] + '_' + cname + + call = CALL_TEMPLATE.substitute(actuals=actuals, cname=cname) + if cimpl.get('condition') is not None: + call = 'if ({}) {}'.format(cimpl['condition'], call) + return call + + def emit_body(env, option): + # type: (Environment, FunctionOption) -> List[str] + body = [] # type: List[str] + body += handle_sparse(env, option) + body += handle_zero_dim(env, option) + only_zero_dim_check = handle_only_zero_dim(env, option) + if only_zero_dim_check is not None: + # code below only_zero_dim_check is unreachable so we do not need to generate the rest. 
+ body += only_zero_dim_check + return body + + # arguments are potentially duplicated because of one argument + # referencing another + seen_names = set() # type: Set[str] + seen_tensorlists = set() # type: Set[str] + count = 0 + output_count = 0 + + # scalar_check is the heuristic conditions when a result may be a scalar_check + # if there is a THSize* argument, then its dimensions are used to determine scalar. + # otherwise, it is true if all the input tensors are scalars, + scalar_check_is_from_size = False + scalar_check_is_from_option = False + scalar_check = None + scalar_check_opt = option.get('scalar_check') + if scalar_check_opt is not None: + if isinstance(scalar_check_opt, bool): + scalar_check = str(scalar_check_opt).lower() + else: + scalar_check = scalar_check_opt + scalar_check_is_from_option = True + + for arg in option['arguments']: + if is_real_argument_to_wrapper(arg): + count += 1 + if arg['type'] == 'THSize*' and not scalar_check_is_from_option: + scalar_check_is_from_size = True + scalar_check = '{}.size() == 0'.format(arg['name']) + if arg['type'] == 'TensorList': + seen_tensorlists.add(arg['name']) + + wrap_dim_target = arg.get('wrap_dim', None) + if wrap_dim_target is not None: + # for Tensors, "name_" is the TensorImpl, but for TensorLists, it is an + # std::vector of TH*s. Since TH*s have different dimension rules, we used + # "name" instead, but keep "name_" for tensor to avoid an extra function call. + if wrap_dim_target not in seen_tensorlists: + wrap_dim_target = wrap_dim_target + "_" + body.append("{} = maybe_wrap_dim({}, {});" + .format(arg['name'], arg['name'], wrap_dim_target)) + + # only generated checked casts the first time we see it + if arg['name'] not in seen_names and requires_checked_cast(arg): + seen_names.add(arg['name']) + + # make a new allocation of TensorImpl, then wrap a Tensor around it. + if arg.get('allocate', False): + body += allocate_arg(env, arg, output_count) + output_count += 1 + # extract the TensorImpl from an existing tensor (or Storage, etc.) + else: + # special case where we allow undefined Tensors, and thus + # the checked cast succeeds even if the Tensor is not + # defined + null_okay = 'true' if nullable_argument(arg) else 'false' + default_init = [] + if 'default_init' in arg: + default_init.append(arg['default_init']) + + if arg['type'] in DIRECT_CONSTRUCTION_CHECKED_CAST: + body.append(CHECKED_CAST[arg['type']].substitute( + env, arg_name=arg['name'], arg_pos=count, + null_okay=null_okay, default_init=default_init, + size=arg.get('size'), + result_name=arg['name'] + '_')) + else: + check_cast = CHECKED_CAST[arg['type']].substitute( + env, arg_name=arg['name'], arg_pos=count, + null_okay=null_okay, default_init=default_init, + size=arg.get('size')) + body.append("auto {}_ = {};".format( + arg['name'], check_cast)) + if drop_argument(arg, option) or replace_with_null(arg): + body.append( + "(void) {}_; //silence unused warning".format(arg['name'])) + + initializers = [] + + # resize tensors for special ops that require it + if 'resize' in arg: + initializers.append(resize_arg(arg)) + + # also special handling where we zero some outputs. 
+ if arg.get('zero', False) or (arg.get('cpu_zero', False) and not is_cuda): + initializers.append("{}.zero_();".format(arg['name'])) + + # only initialize non-null arguments + if nullable_argument(arg) and len(initializers) > 0: + body.append(CONDITIONAL_INITIALIZER.substitute({ + 'name': arg['name'], + 'initializer': initializers + })) + else: + body += initializers + + # for out-of-place: isScalar() for all input tensors is and'd to form + # the test for whether the output is also a scalar + # for in-place: isScalar() shouldn't change as a result of the operation + if (not arg.get('output') and 'Tensor' in arg['type'] and + 'TensorList' not in arg['type'] and + 'THS' not in arg['type'] and + not scalar_check_is_from_size and + not scalar_check_is_from_option and + not option['inplace']): + check = '{}->isScalar()'.format(arg['name'] + '_') + if nullable_argument(arg): + check = '(!{} || {})'.format(arg['name'] + '_', check) + scalar_check = (check if scalar_check is None + else scalar_check + ' && ' + check) + + # cimpls, if it exists, contains the underlying C function names and + # arguments. Otherwise use option + cimpls = option.get('cimpls', [option]) + calls = [handle_call(env, option, cimpl) for cimpl in cimpls] + + ret = option['return'] + + if ret['kind'] == 'arguments': + if 'aten_custom_call' in option: + # all aten_custom_call bodies handle settings on their own. + scalar_check = None + body.append(CodeTemplate( + option['aten_custom_call']).substitute(env)) + else: + body.extend([call + ';' for call in calls]) + arguments_indices = ret['arguments'] + arguments = [option['arguments'][argi] + for argi in arguments_indices] + if scalar_check is not None: + if not isinstance(scalar_check, dict): + if len(arguments) > 1: + body.append("bool maybe_scalar = {};".format(scalar_check)) + scalar_check = 'maybe_scalar' + for arg in arguments: + scalar_check_arg = (scalar_check if not isinstance(scalar_check, dict) + else scalar_check.get(arg['name'])) # type: ignore + if scalar_check_arg is not None: + stmt = "{}_->maybeScalar({});".format(arg['name'], scalar_check_arg) + if nullable_argument(arg): + stmt = "if ({}_) {}".format(arg['name'], stmt) + body.append(stmt) + if len(arguments_indices) == 1: + arg = arguments[0] + body.append("return {};".format(arg['name'])) + else: + types = [to_return_type(arg, option)['type'] + for arg in arguments] + # TODO: check for move semantics... + names = [arg['name'] for arg in arguments] + body.append(CodeTemplate("return std::tuple<${types}>(${names});").substitute( + types=types, names=names)) + elif ret['kind'] == 'type': + assert len(calls) == 1 + call = calls[0] + if 'aten_custom_call' in option: + # all aten_custom_call bodies handle settings on their own. + scalar_check = None + body.append(CodeTemplate( + option['aten_custom_call']).substitute(env)) + + if ret['type'] in ALLOC_WRAP.keys(): + maybe_scalar = "->maybeScalar({})".format(scalar_check) \ + if scalar_check is not None \ + else "" + wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( + env, arguments=[call]) + return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + body.append(CodeTemplate(return_tensor).substitute( + env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) + # return the same underlying Tensor type for both real and accreal; this ensures + # e.g. x.sum(0) and x.sum() return the same type. We explicitly cast to the + # ScalarType before constructing the scalarTensor to avoid overflow checking. 
+ elif ret['type'] == 'accreal' or ret['type'] == 'real': + return_scalar = 'return scalarTensor(convert<${ScalarType}>(${call}));' + body.append(CodeTemplate(return_scalar).substitute(env, call=call)) + else: + # we using int64_t for long in the API, so correct it here... + if is_actual_return_long(ret): + call = "static_cast({})".format(call) + body.append("return {};".format(call)) + else: + raise Exception("NYI - return handling") + return body + + def process_option(option): + # type: (FunctionOption) -> None + pair = (backend_type_env['Backend'], + backend_type_env['ScalarName']) + if pair in option['backend_type_pairs']: + env = nested_dict(option, backend_type_env) + body = emit_body(env, option) # type: ignore + option['type_definition_body'] = body + type_object_declarations.append( + TYPE_DERIVED_DECLARATION.substitute(env)) + type_object_definitions.append( + TYPE_DERIVED_DEFINITION.substitute(env)) + + def process_native(option): + # type: (FunctionOption) -> None + dispatch = option['type_method_definition_dispatch'] + env = nested_dict(option, backend_type_env) + + if isinstance(dispatch, dict): + pair = (backend_type_env['Backend'], + backend_type_env['ScalarName']) + if pair in option['backend_type_pairs']: + native_dispatch = dispatch.get(pair[0]) + type_object_declarations.append( + TYPE_DERIVED_DECLARATION.substitute(env)) + if native_dispatch is None: + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) + else: + option['native_type_method_dispatch'] = native_dispatch + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE.substitute(env)) + + for declaration in declarations: + for option in declaration['options']: + if not option.get('skip', False): + try: + if option['mode'] == 'NN' and option.get('cimpls') is None: + continue + if option['mode'] != 'native': + process_option(option) + else: + process_native(option) + except NYIError: + pass + return type_object_declarations, type_object_definitions diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py new file mode 100644 index 0000000..6d3598a --- /dev/null +++ b/aten/src/ATen/gen.py @@ -0,0 +1,457 @@ +import argparse +import os + +import yaml +from collections import OrderedDict + +import sys +from os import path +sys.path.append(path.dirname(path.abspath(__file__))) + +import cwrap_parser +import nn_parse +import native_parse +import preprocess_declarations +import function_wrapper +import copy_wrapper + +from code_template import CodeTemplate + + +# This file is the top-level entry point for code generation in ATen. +# It takes an arbitrary number of arguments specifying metadata files to +# process (.cwrap, .yaml and .h) and outputs a number generated header +# and cpp files in ATen/ (see invocations of 'write' for each file that +# is written.) It is invoked from cmake; look for the 'cwrap_files' +# variable for an up-to-date list of files which are passed. 
+ +parser = argparse.ArgumentParser(description='Generate ATen source files') +parser.add_argument('files', help='cwrap files', nargs='+') + +parser.add_argument( + '-s', + '--source-path', + help='path to source directory for ATen', + default='.') +parser.add_argument( + '-o', + '--output-dependencies', + help='output a list of dependencies into the given file and exit') +parser.add_argument( + '-d', '--install_dir', help='output directory', default='ATen') +options = parser.parse_args() + +if options.install_dir is not None and not os.path.exists(options.install_dir): + os.makedirs(options.install_dir) + + +class FileManager(object): + def __init__(self): + self.filenames = set() + self.outputs_written = False + self.undeclared_files = [] + + def will_write(self, filename): + filename = '{}/{}'.format(options.install_dir, filename) + if self.outputs_written: + raise Exception("'will_write' can only be called before " + + "the call to write_outputs, refactor so outputs are registered " + + "before running the generators") + self.filenames.add(filename) + + def _write_if_changed(self, filename, contents): + try: + with open(filename, 'r') as f: + old_contents = f.read() + except IOError: + old_contents = None + if contents != old_contents: + with open(filename, 'w') as f: + f.write(contents) + + def write_outputs(self, filename): + """Write a file containing the list of all outputs which are + generated by this script.""" + self._write_if_changed( + filename, + ''.join(name + ";" for name in sorted(self.filenames))) + self.outputs_written = True + + def write(self, filename, s, env=None): + filename = '{}/{}'.format(options.install_dir, filename) + if isinstance(s, CodeTemplate): + assert env is not None + env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" + s = s.substitute(env) + self._write_if_changed(filename, s) + if filename not in self.filenames: + self.undeclared_files.append(filename) + else: + self.filenames.remove(filename) + + def check_all_files_written(self): + if len(self.undeclared_files) > 0: + raise Exception( + "trying to write files {} which are not ".format(self.undeclared_files) + + "in the list of outputs this script produces. 
" + + "use will_write to add them.") + if len(self.filenames) > 0: + raise Exception("Outputs declared with 'will_write' were " + + "never written: {}".format(self.filenames)) + + +TEMPLATE_PATH = options.source_path + "/templates" +GENERATOR_DERIVED = CodeTemplate.from_file( + TEMPLATE_PATH + "/GeneratorDerived.h") +STORAGE_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/StorageDerived.cpp") +STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") + +TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") +SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") +TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") +TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") + +TENSOR_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/TensorDerived.cpp") +TENSOR_DENSE_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/TensorDense.cpp") + +REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") +REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") + +TENSOR_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorDerived.h") +TENSOR_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Tensor.h") +TENSOR_METHODS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorMethods.h") + +FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.h") + +NATIVE_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/NativeFunctions.h") + +TYPE_REGISTER = CodeTemplate("""\ +context->type_registry[static_cast(Backend::${backend})] + [static_cast(ScalarType::${scalar_type})] + .reset(new ${type_name}(context)); +detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); +""") + +file_manager = FileManager() +cuda_file_manager = FileManager() + +generators = { + 'CPUGenerator.h': { + 'name': 'CPU', + 'th_generator': 'THGenerator * generator;', + 'header': 'TH/TH.h', + }, + 'CUDAGenerator.h': { + 'name': 'CUDA', + 'th_generator': '', + 'header': 'THC/THC.h' + }, +} + +backends = ['CPU', 'CUDA'] +densities = ['Dense', 'Sparse'] + +# scalar_name, c_type, accreal, th_scalar_type, is_floating_type +scalar_types = [ + ('Byte', 'uint8_t', 'Long', 'uint8_t', False), + ('Char', 'int8_t', 'Long', 'int8_t', False), + ('Double', 'double', 'Double', 'double', True), + ('Float', 'float', 'Double', 'float', True), + ('Int', 'int', 'Long', 'int32_t', False), + ('Long', 'int64_t', 'Long', 'int64_t', False), + ('Short', 'int16_t', 'Long', 'int16_t', False), + ('Half', 'Half', 'Double', 'THHalf', True), +] + +# shared environment for non-derived base classes Type.h Tensor.h Storage.h +top_env = { + 'cpu_type_registrations': [], + 'cpu_type_headers': [], + 'cuda_type_registrations': [], + 'cuda_type_headers': [], + 'type_method_declarations': [], + 'type_method_definitions': [], + 'type_method_inline_definitions': [], + 'tensor_method_declarations': [], + 'tensor_method_definitions': [], + 'function_declarations': [], + 'function_definitions': [], + 'type_ids': [], + 'native_function_declarations': [], +} + + +def dict_representer(dumper, data): + return dumper.represent_dict(data.items()) + + +def postprocess_output_declarations(output_declarations): + # ensure each return has a name associated with it + for decl in output_declarations: + has_named_ret = False + for n, ret in enumerate(decl.returns): + if 'name' not in ret: + assert not has_named_ret + 
if decl.inplace: + ret['name'] = 'self' + elif len(decl.returns) == 1: + ret['name'] = 'result' + else: + ret['name'] = 'result' + str(n) + else: + has_named_ret = True + + def remove_key_if_none(dictionary, key): + if key in dictionary.keys() and dictionary[key] is None: + del dictionary[key] + return dictionary + + return [remove_key_if_none(decl._asdict(), 'buffers') + for decl in output_declarations] + + +def format_yaml(data): + if options.output_dependencies: + # yaml formatting is slow so don't do it if we will ditch it. + return "" + noalias_dumper = yaml.dumper.SafeDumper + noalias_dumper.ignore_aliases = lambda self, data: True + # Support serializing OrderedDict + noalias_dumper.add_representer(OrderedDict, dict_representer) + return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper) + + +def generate_storage_type_and_tensor(backend, density, scalar_type, declarations): + scalar_name, c_type, accreal, th_scalar_type, is_floating_type = scalar_type + env = {} + density_tag = 'Sparse' if density == 'Sparse' else '' + env['Density'] = density + env['ScalarName'] = scalar_name + env['ScalarType'] = c_type + env['THScalarType'] = th_scalar_type + env['AccScalarName'] = accreal + env['isFloatingType'] = is_floating_type + env['isIntegralType'] = not is_floating_type + if density == 'Dense': + env['Storage'] = "{}{}Storage".format(backend, scalar_name) + env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) + env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) + env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) + env['Backend'] = density_tag + backend + env['DenseBackend'] = backend + env['storage_tensor_headers'] = [] + if density != 'Sparse': + env['storage_tensor_headers'] = [ + '#include "ATen/{}.h"'.format(env['Storage']), + '#include "ATen/{}.h"'.format(env['Tensor']), + '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), + '#include "ATen/{}IntTensor.h"'.format(env['Backend']), + '#include "ATen/{}LongTensor.h"'.format(env['Backend']), + ] + + # used for generating switch logic for external functions + tag = density_tag + backend + scalar_name + env['TypeID'] = 'TypeID::' + tag + top_env['type_ids'].append(tag + ',') + + if backend == 'CUDA': + env['th_headers'] = [ + '#include ', + '#include ', + '#include ', + '#undef THNN_', + '#undef THCIndexTensor_', + ] + env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'].append('#include ') + sname = '' if scalar_name == "Float" else scalar_name + env['THType'] = 'Cuda{}'.format(sname) + env['THStorage'] = 'THCuda{}Storage'.format(sname) + env['THTensor'] = 'THCuda{}Tensor'.format(sname) + env['THIndexTensor'] = 'THCudaLongTensor' + env['state'] = ['context->getTHCState()'] + env['isCUDA'] = 'true' + env['storage_device'] = 'return storage->device;' + env['Generator'] = 'CUDAGenerator' + else: + env['th_headers'] = [ + '#include ', + '#include ', + '#include ', + '#undef THNN_', + ] + env['extra_cuda_headers'] = [] + env['THType'] = scalar_name + env['THStorage'] = "TH{}Storage".format(scalar_name) + env['THTensor'] = 'TH{}Tensor'.format(scalar_name) + env['THIndexTensor'] = 'THLongTensor' + env['state'] = [] + env['isCUDA'] = 'false' + env['storage_device'] = 'throw std::runtime_error("CPU storage has no device");' + env['Generator'] = 'CPUGenerator' + env['AS_REAL'] = env['ScalarType'] + if scalar_name == "Half": + env['SparseTensor'] = 'Tensor' + if backend == "CUDA": + env['to_th_type'] = 'HalfFix<__half,Half>' + env['to_at_type'] = 'HalfFix' + 
env['AS_REAL'] = 'convert' + env['THScalarType'] = 'half' + else: + env['to_th_type'] = 'HalfFix' + env['to_at_type'] = 'HalfFix' + elif scalar_name == 'Long': + env['to_th_type'] = 'long' + env['to_at_type'] = 'int64_t' + else: + env['to_th_type'] = '' + env['to_at_type'] = '' + + declarations, definitions = function_wrapper.create_derived( + env, declarations) + env['type_derived_method_declarations'] = declarations + env['type_derived_method_definitions'] = definitions + + fm = file_manager + if env['DenseBackend'] == 'CUDA': + fm = cuda_file_manager + + if density != 'Sparse': + # there are no storage or tensor types for sparse; it's all uniform + fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) + fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) + env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) + fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) + fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) + + if density != 'Sparse': + fm.write(env['Type'] + ".cpp", TYPE_DERIVED_CPP, env) + else: + fm.write(env['Type'] + ".cpp", SPARSE_TYPE_DERIVED_CPP, env) + fm.write(env['Type'] + ".h", TYPE_DERIVED_H, env) + + type_register = TYPE_REGISTER.substitute(backend=env['Backend'], scalar_type=scalar_name, type_name=env['Type']) + if env['DenseBackend'] == 'CPU': + top_env['cpu_type_registrations'].append(type_register) + top_env['cpu_type_headers'].append( + '#include "ATen/{}.h"'.format(env['Type'])) + else: + assert env['DenseBackend'] == 'CUDA' + top_env['cuda_type_registrations'].append(type_register) + top_env['cuda_type_headers'].append( + '#include "ATen/{}.h"'.format(env['Type'])) + + return env + + +def iterate_types(): + for backend in backends: + for density in densities: + for scalar_type in scalar_types: + if density == 'Sparse' and scalar_type[0] == 'Half': + # THS does not do half type yet. 
+ continue + yield (backend, density, scalar_type) + + +################### +# declare what files will be output _before_ we do any work +# so that the script runs quickly when we are just querying the +# outputs +def declare_outputs(): + files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', + 'TensorMethods.h', 'Functions.h', + 'CPUCopy.cpp', 'NativeFunctions.h'] + for f in files: + file_manager.will_write(f) + cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] + for f in cuda_files: + cuda_file_manager.will_write(f) + for fname in sorted(generators.keys()): + fm = file_manager + if generators[fname]['name'] == 'CUDA': + fm = cuda_file_manager + fm.will_write(fname) + for backend, density, scalar_types in iterate_types(): + scalar_name = scalar_types[0] + full_backend = "Sparse" + backend if density == "Sparse" else backend + for kind in ["Storage", "Type", "Tensor"]: + if kind != 'Type' and density == "Sparse": + # No Storage or Tensor for sparse + continue + fm = file_manager + if backend == 'CUDA': + fm = cuda_file_manager + fm.will_write("{}{}{}.h".format(full_backend, scalar_name, kind)) + fm.will_write("{}{}{}.cpp".format(full_backend, scalar_name, kind)) + + +def filter_by_extension(files, *extensions): + filtered_files = [] + for file in files: + for extension in extensions: + if file.endswith(extension): + filtered_files.append(file) + return filtered_files + + +def generate_outputs(): + cwrap_files = filter_by_extension(options.files, '.cwrap') + nn_files = filter_by_extension(options.files, 'nn.yaml', '.h') + native_files = filter_by_extension(options.files, 'native_functions.yaml') + + declarations = [d + for file in cwrap_files + for d in cwrap_parser.parse(file)] + + declarations += nn_parse.run(nn_files) + declarations += native_parse.run(native_files) + declarations = preprocess_declarations.run(declarations) + for fname, env in generators.items(): + fm = file_manager + if env['name'] == 'CUDA': + fm = cuda_file_manager + fm.write(fname, GENERATOR_DERIVED, env) + + # note: this will fill in top_env['type/tensor_method_declarations/definitions'] + # and modify the declarations to include any information that will all_backends + # be used by function_wrapper.create_derived + output_declarations = function_wrapper.create_generic(top_env, declarations) + output_declarations = postprocess_output_declarations(output_declarations) + file_manager.write("Declarations.yaml", format_yaml(output_declarations)) + + # populated by generate_storage_type_and_tensor + all_types = [] + + for backend, density, scalar_type in iterate_types(): + all_types.append(generate_storage_type_and_tensor( + backend, density, scalar_type, declarations)) + + file_manager.write('Type.h', TYPE_H, top_env) + file_manager.write('Type.cpp', TYPE_CPP, top_env) + + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) + cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) + + file_manager.write('Tensor.h', TENSOR_H, top_env) + file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) + file_manager.write('Functions.h', FUNCTIONS_H, top_env) + + file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) + cuda_file_manager.write('CUDACopy.cpp', copy_wrapper.create(all_types, 'CUDA')) + file_manager.write('NativeFunctions.h', NATIVE_FUNCTIONS_H, top_env) + + file_manager.check_all_files_written() + cuda_file_manager.check_all_files_written() + + +declare_outputs() +if options.output_dependencies is not None: + 
file_manager.write_outputs(options.output_dependencies)
+    cuda_file_manager.write_outputs(options.output_dependencies + "-cuda")
+else:
+    generate_outputs()
diff --git a/aten/src/ATen/mkl/Descriptors.h b/aten/src/ATen/mkl/Descriptors.h
new file mode 100644
index 0000000..efedcd0
--- /dev/null
+++ b/aten/src/ATen/mkl/Descriptors.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "Exceptions.h"
+#include 
+#include 
+
+namespace at { namespace native {
+
+struct DftiDescriptorDeleter {
+  void operator()(DFTI_DESCRIPTOR* desc) {
+    if (desc != nullptr) {
+      MKL_DFTI_CHECK(DftiFreeDescriptor(&desc));
+    }
+  }
+};
+
+class DftiDescriptor {
+public:
+  void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) {
+    if (desc_ != nullptr) {
+      throw std::runtime_error("DFTI DESCRIPTOR can only be initialized once");
+    }
+    DFTI_DESCRIPTOR *raw_desc;
+    if (signal_ndim == 1) {
+      MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0]));
+    } else {
+      MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, signal_ndim, sizes));
+    }
+    desc_.reset(raw_desc);
+  }
+
+  DFTI_DESCRIPTOR *get() const {
+    if (desc_ == nullptr) {
+      throw std::runtime_error("DFTI DESCRIPTOR has not been initialized");
+    }
+    return desc_.get();
+  }
+
+private:
+  std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
+};
+
+
+}} // at::native
diff --git a/aten/src/ATen/mkl/Exceptions.h b/aten/src/ATen/mkl/Exceptions.h
new file mode 100644
index 0000000..e954a07
--- /dev/null
+++ b/aten/src/ATen/mkl/Exceptions.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at { namespace native {
+
+static inline void MKL_DFTI_CHECK(MKL_INT status)
+{
+  if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) {
+    std::ostringstream ss;
+    ss << "MKL FFT error: " << DftiErrorMessage(status);
+    throw std::runtime_error(ss.str());
+  }
+}
+
+}} // namespace at::native
diff --git a/aten/src/ATen/mkl/Limits.h b/aten/src/ATen/mkl/Limits.h
new file mode 100644
index 0000000..b0d3829
--- /dev/null
+++ b/aten/src/ATen/mkl/Limits.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include 
+
+namespace at { namespace native {
+
+  // Since the size of MKL_LONG varies across platforms (linux 64 bit, windows
+  // 32 bit), we need to programmatically calculate the max.
+  static int64_t MKL_LONG_MAX = ((1LL << (sizeof(MKL_LONG) * 8 - 2)) - 1) * 2 + 1;
+
+}} // namespace
diff --git a/aten/src/ATen/mkl/README.md b/aten/src/ATen/mkl/README.md
new file mode 100644
index 0000000..2916246
--- /dev/null
+++ b/aten/src/ATen/mkl/README.md
@@ -0,0 +1,4 @@
+All files living in this directory are written with the assumption that MKL is available,
+which means that this code is not guarded by `#if AT_MKL_ENABLED()`. Therefore, whenever
+you need to use definitions from here, please guard the `#include` and any use of those
+definitions with the `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](native/mkl/SpectralOps.cpp).
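A minimal sketch of the guard the README above asks for, assuming `AT_MKL_ENABLED()` comes from `ATen/Config.h` (which Convolution.cpp later in this patch already includes for the analogous `AT_MKLDNN_ENABLED()` check). The operator name `my_mkl_op` is made up for illustration and is not part of this patch:

```
// Illustrative only: guard both the #include and the call sites so builds
// without MKL still compile. my_mkl_op is a hypothetical caller, not real ATen code.
#include "ATen/ATen.h"
#include "ATen/Config.h"  // assumed to provide AT_MKL_ENABLED()

#if AT_MKL_ENABLED()
#include "ATen/mkl/Descriptors.h"  // only safe to include when MKL is present
#endif

#include <stdexcept>

namespace at { namespace native {

Tensor my_mkl_op(const Tensor& self) {
#if AT_MKL_ENABLED()
  // MKL is available: the unguarded helpers from ATen/mkl/ may be used here.
  DftiDescriptor desc;
  (void)desc;  // real code would configure the descriptor and run the DFTI computation
  return self.clone();
#else
  // Built without MKL: fail loudly instead of referencing MKL symbols.
  throw std::runtime_error("my_mkl_op: ATen not compiled with MKL support");
#endif
}

}} // namespace at::native
```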
\ No newline at end of file diff --git a/aten/src/ATen/mkldnn/Runtime.cpp b/aten/src/ATen/mkldnn/Runtime.cpp new file mode 100644 index 0000000..54f999e --- /dev/null +++ b/aten/src/ATen/mkldnn/Runtime.cpp @@ -0,0 +1,5 @@ +#include "Runtime.h" + +namespace at { namespace native { + +}} // namespace at::native diff --git a/aten/src/ATen/mkldnn/Runtime.h b/aten/src/ATen/mkldnn/Runtime.h new file mode 100644 index 0000000..c58ef2c --- /dev/null +++ b/aten/src/ATen/mkldnn/Runtime.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +using namespace mkldnn; + +namespace at { namespace native { + +// CpuEngine singleton +struct CpuEngine { + static CpuEngine& Instance() { + static CpuEngine myInstance; + return myInstance; + } + engine& get_engine() { + return _cpu_engine; + } + CpuEngine(CpuEngine const&) = delete; + CpuEngine& operator=(CpuEngine const&) = delete; + +protected: + CpuEngine():_cpu_engine(mkldnn::engine::cpu, 0) {} + ~CpuEngine() {} + +private: + engine _cpu_engine; +}; + +// Stream singleton +struct Stream { + static Stream& Instance() { + static Stream myInstance; + return myInstance; + }; + stream& get_stream() { + return _cpu_stream; + } + Stream(Stream const&) = delete; + Stream& operator=(Stream const&) = delete; + +protected: + Stream():_cpu_stream(mkldnn::stream::kind::eager) {} + ~Stream() {} + +private: + stream _cpu_stream; +}; + +}} // namespace at::native diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp new file mode 100644 index 0000000..87bd091 --- /dev/null +++ b/aten/src/ATen/native/Activation.cpp @@ -0,0 +1,72 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Dispatch.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Half.h" + +namespace at { namespace native { + +static const double SELU_ALPHA = 1.6732632423543772848170429916717; +static const double SELU_SCALE = 1.0507009873554804934193349852946; + +Tensor relu(const Tensor & self) { + return self.clamp_min(0.0); +} + +Tensor & relu_(Tensor & self) { + return self.clamp_min_(0.0); +} + +Tensor selu(const Tensor & self) { + return at::elu(self, SELU_ALPHA, SELU_SCALE); +} + +Tensor & selu_(Tensor & self) { + return at::elu_(self, SELU_ALPHA, SELU_SCALE); +} + +Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { + return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); +} + +Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { + return at::rrelu_with_noise_(self, self.type().tensor(), lower, upper, training, generator); +} + +Tensor hardshrink_cpu(const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_cpu", [&] { + scalar_t* lambd_tensor_d = lambd_tensor.data(); + at::CPU_tensor_apply2( + self, + out_tensor, + [lambd_tensor_d]( + scalar_t& self_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd_tensor_d && self_val <= *lambd_tensor_d) ? convert(0) : self_val; + }); + }); + return out_tensor; +} + +Tensor hardshrink_backward_cpu(const Tensor & grad, const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? 
Backend::CUDA : Backend::CPU);
+  auto out_tensor = at::empty_like(self);
+  AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_backward_cpu", [&] {
+    scalar_t* lambd_tensor_d = lambd_tensor.data<scalar_t>();
+    at::CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(
+        self,
+        grad,
+        out_tensor,
+        [lambd_tensor_d](
+            scalar_t& self_val,
+            scalar_t& grad_val,
+            scalar_t& out_tensor_val) {
+          out_tensor_val = (self_val >= -*lambd_tensor_d && self_val <= *lambd_tensor_d) ? convert<scalar_t>(0) : grad_val;
+        });
+  });
+  return out_tensor;
+}
+
+}} // namespace at::native
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
new file mode 100644
index 0000000..a537691
--- /dev/null
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -0,0 +1,685 @@
+#include "ATen/ATen.h"
+#include "ATen/NativeFunctions.h"
+
+#include "ATen/Config.h"
+
+namespace at { namespace native {
+
+struct ConvParams {
+  std::vector<int64_t> stride;
+  std::vector<int64_t> padding;
+  std::vector<int64_t> dilation;
+  bool transposed;
+  std::vector<int64_t> output_padding;
+  int groups;
+  bool benchmark;
+  bool deterministic;
+  bool cudnn_enabled;
+
+  bool is_strided() const;
+  bool is_dilated() const;
+  bool is_padded() const;
+  bool is_output_padding_neg() const;
+  bool is_output_padding_big() const;
+  bool is_padding_neg() const;
+  void view1d_as_2d();
+  bool use_cudnn(const at::Tensor& input) const;
+  bool use_mkldnn(const at::Tensor& input) const;
+  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
+};
+
+std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
+  out << "ConvParams {"
+      << " stride = " << IntList{params.stride}
+      << " padding = " << IntList{params.padding}
+      << " dilation = " << IntList{params.dilation}
+      << " transposed = " << params.transposed
+      << " output_padding = " << IntList{params.output_padding}
+      << " groups = " << params.groups
+      << " benchmark = " << params.benchmark
+      << " deterministic = " << params.deterministic
+      << " cudnn_enabled = " << params.cudnn_enabled
+      << "}";
+  return out;
+}
+
+auto ConvParams::is_strided() const -> bool {
+  bool is_strided = false;
+  for (int s : stride) {
+    is_strided |= (s != 1);
+  }
+  return is_strided;
+}
+
+auto ConvParams::is_dilated() const -> bool {
+  bool is_dilated = false;
+  for (int d : dilation) {
+    is_dilated |= (d != 1);
+  }
+  return is_dilated;
+}
+
+auto ConvParams::is_padded() const -> bool {
+  bool is_padded = false;
+  for (int p : padding) {
+    is_padded |= (p != 0);
+  }
+  return is_padded;
+}
+
+auto ConvParams::is_output_padding_neg() const -> bool {
+  bool is_non_neg = false;
+  for (int p : output_padding) {
+    is_non_neg |= (p < 0);
+  }
+  return is_non_neg;
+}
+
+auto ConvParams::is_output_padding_big() const -> bool {
+  bool is_big = false;
+  for (size_t i = 0; i < output_padding.size(); i++) {
+    is_big |= (output_padding[i] >= stride[i] || output_padding[i] >= dilation[i]);
+  }
+  return is_big;
+}
+
+auto ConvParams::is_padding_neg() const -> bool {
+  bool is_non_neg = false;
+  for (int p : padding) {
+    is_non_neg |= (p < 0);
+  }
+  return is_non_neg;
+}
+
+
+auto ConvParams::view1d_as_2d() -> void {
+  if (stride.size() == 1) {
+    stride.insert(stride.begin(), 1);
+    padding.insert(padding.begin(), 0);
+    dilation.insert(dilation.begin(), 1);
+    output_padding.insert(output_padding.begin(), 0);
+  }
+}
+
+auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool {
+  if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+    return false;
+  }
+  if (!input.type().is_cuda() || !cudnn_enabled) {
+    return false;
+  }
+  if (deterministic && is_dilated()) {
+    // 
cudnn doesn't support deterministic dilated convolution fully yet + return false; + } + if (is_dilated()) { + return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big(); + } + return !is_output_padding_big(); +} + +auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool { +#if AT_MKLDNN_ENABLED() + return input.type().backend() == kCPU && + input.type().scalarType() == kFloat && // only on CPU Float Tensors + !is_dilated() && // doesn't support dilation + !transposed && // or transposed tensors + input.ndimension() == 4; // must be in NCHW format +#endif + return false; +} + +// We currently only have depthwise support for the case where groups == +// nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of +// a depthwise multiplier) +auto ConvParams::is_depthwise( + const at::Tensor& input, const at::Tensor& weight) const -> bool { + return input.type().is_cuda() && + !transposed && + input.ndimension() == 4 && + input.size(1) == groups && + groups > 1 && // no point if there is only a single group + weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels +} + +static void check_input_shape_forward(const at::Tensor& input, + const at::Tensor& weight, const at::Tensor& bias, + int64_t groups, bool transposed) { + int64_t k = input.ndimension(); + int64_t weight_dim = weight.ndimension(); + + if (weight_dim != k) { + std::stringstream ss; + ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim + << "-dimensional weight " << weight.sizes() << ", but got input of size " + << input.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + if (weight.size(0) < groups) { + std::stringstream ss; + ss << "Given groups=" << groups << ", expected weight to be at least " + << groups << " at dimension 0, but got weight of size " << weight.sizes() + << " instead"; + throw std::runtime_error(ss.str()); + } + + if (!transposed) { + if (input.size(1) != (weight.size(1) * groups)) { + std::stringstream ss; + ss << "Given groups=" << groups << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " + << (weight.size(1) * groups) << " channels, but got " << input.size(1) + << " channels instead"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { + std::stringstream ss; + ss << "Given weight of size " << weight.sizes() + << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" + << ", but got bias of size " << bias.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + } else { // transposed + if (input.size(1) != weight.size(0)) { + std::stringstream ss; + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " + << weight.size(0) << " channels, but got " << input.size(1) + << " channels instead"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { + std::stringstream ss; + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" + << ", but got bias of size " << bias.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + } +} + +static auto view4d(const at::Tensor& tensor) -> at::Tensor { + if (tensor.ndimension() != 3) throw 
std::runtime_error("expected 3D tensor"); + return tensor.unsqueeze(2); +} + +static auto view3d(const at::Tensor& tensor) -> at::Tensor { + if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + return tensor.squeeze(2); +} + + +static at::Tensor subtensor(at::Tensor& tensor, int dim, int groups, int g) { + if (!tensor.defined()) { + return at::Tensor(); + } + int64_t n = tensor.sizes()[dim] / groups; + return tensor.narrow(dim, n * g, n).contiguous(); +} + + +at::Tensor conv1d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {0}, groups); +} + +at::Tensor conv2d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {{0, 0}}, groups); +} + +at::Tensor conv3d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {{0, 0, 0}}, groups); +} + +at::Tensor conv_transpose1d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor conv_transpose2d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor conv_transpose3d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor convolution( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, + bool transposed, IntList output_padding, int64_t groups) { + auto& ctx = at::globalContext(); + return at::_convolution(input, weight, bias, stride, padding, dilation, + transposed, output_padding, groups, + ctx.benchmarkCuDNN(), ctx.deterministicCuDNN(), ctx.userEnabledCuDNN()); +} + +static inline std::vector convolution_expand_param_if_needed( + IntList list_param, const char *param_name, int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t) list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << "=" << list_param; + throw std::runtime_error(ss.str()); + } else { + return list_param.vec(); + } +} + +at::Tensor _convolution( + const Tensor& input_r, const Tensor& weight_r, const Tensor& bias_r, + IntList stride_, IntList padding_, IntList dilation_, + bool transposed_, IntList output_padding_, int64_t groups_, + bool benchmark, bool deterministic, bool cudnn_enabled) { + + auto input = input_r.contiguous(); + auto weight = weight_r; + auto 
bias = bias_r; + auto k = weight.ndimension(); + int64_t dim = k - 2; + + if (dim <= 0) { + throw std::runtime_error("weight should have at least two dimensions"); + } + + ConvParams params; + params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); + params.padding = convolution_expand_param_if_needed(padding_, "padding", dim); + params.dilation = convolution_expand_param_if_needed(dilation_, "dilation", dim); + params.transposed = transposed_; + params.output_padding = convolution_expand_param_if_needed(output_padding_, "output_padding", dim); + params.groups = groups_; + params.benchmark = benchmark; + params.deterministic = deterministic; + params.cudnn_enabled = cudnn_enabled; + + if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); + if (params.is_output_padding_neg()) throw std::runtime_error("negative output_padding is not supported"); + + check_input_shape_forward(input, weight, bias, params.groups, params.transposed); + + if (k == 3) { + params.view1d_as_2d(); + input = view4d(input); + weight = view4d(weight); + } + + auto output = input.type().tensor(); + + if (params.is_depthwise(input, weight)) { + /* output.resize_(output_size(input, weight)); */ + + auto kernel_size = weight.sizes().slice(2); + auto stride = params.stride; + auto padding = params.padding; + auto dilation = params.dilation; + + output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); + } else if (params.use_cudnn(input)) { + if (input.type() != weight.type()){ + std::stringstream ss; + ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && input.type() != bias.type()){ + std::stringstream ss; + ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + + if (params.transposed) { + output = at::cudnn_convolution_transpose( + input, weight, bias, + params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); + } else { + output = at::cudnn_convolution( + input, weight, bias, + params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); + } + } else if (params.use_mkldnn(input)) { +#if AT_MKLDNN_ENABLED() + if (input.type() != weight.type()){ + std::stringstream ss; + ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && input.type() != bias.type()){ + std::stringstream ss; + ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + + output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); +#endif + } else { + if (params.groups == 1) { + output = at::_convolution_nogroup( + input, weight, bias, params.stride, params.padding, params.dilation, params.transposed, params.output_padding); + } else { + std::vector outputs(params.groups); + for (int g = 0; g < params.groups; ++g) { + auto input_g = subtensor(input, 1, params.groups, g); + auto weight_g = subtensor(weight, 0, params.groups, g); + auto bias_g = subtensor(bias, 0, params.groups, g); + outputs[g] = at::_convolution_nogroup( + 
input_g, weight_g, bias_g, params.stride, params.padding, params.dilation, params.transposed, params.output_padding); + } + output = at::cat(outputs, 1); + } + } + + if (k == 3) { + output = view3d(output); + } + + return output; +} + +// A generic function for convolution implementations which don't +// natively implement groups (e.g., not CuDNN). +at::Tensor _convolution_nogroup( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, + bool transposed, IntList output_padding) { + + ConvParams params; + params.stride = stride; + params.padding = padding; + params.dilation = dilation; + params.transposed = transposed; + params.output_padding = output_padding; + params.groups = 1; + params.benchmark = false; + params.deterministic = false; + params.cudnn_enabled = false; + + auto dim = input.ndimension(); + auto dilated = params.is_dilated(); + auto kernel_size = weight.sizes().slice(2); + + if (params.transposed) { + if (dim == 4) { + return at::thnn_conv_transpose2d( + input, weight, kernel_size, bias, + stride, padding, output_padding, dilation); + } else if (dim == 5) { + return at::thnn_conv_transpose3d( + input, weight, kernel_size, bias, + stride, padding, output_padding, dilation); + } + } else { /* Not transposed */ + if (dim == 4) { + if (dilated) { + return at::thnn_conv_dilated2d( + input, weight, kernel_size, bias, + stride, padding, dilation); + } else { /* dim == 4, non-dilated */ + /* CPU implementation has specialized MM kernels + for non-dilated case here */ + return at::thnn_conv2d( + input, weight, kernel_size, bias, + stride, padding); + } + } else if (dim == 5 && (input.type().is_cuda() || dilated)) { + return at::thnn_conv_dilated3d( + input, weight, kernel_size, bias, + stride, padding, dilation); + } else if (dim == 5) { /* dim == 5, CPU, non-dilated */ + /* CPU implementation has specialized MM kernels + for non-dilated case here */ + return at::thnn_conv3d( + input, weight, kernel_size, bias, + stride, padding); + } + } + + throw std::runtime_error("unsupported ConvNd parameters"); +} + +static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { + int64_t n = var.sizes()[dim] / groups; + auto result = var.narrow(dim, n * g, n); + return result; +} + +std::tuple _convolution_double_backward( + const Tensor& ggI, const Tensor& ggW_r, const Tensor& ggb, + const Tensor& gO_r, const Tensor& weight_r, const Tensor& input, + IntList stride_, IntList padding_, IntList dilation_, + bool transposed_, IntList output_padding_, int64_t groups_, + bool benchmark, bool deterministic, bool cudnn_enabled, + std::array output_mask) { + + auto ggW = ggW_r; + auto gO = gO_r; + auto weight = weight_r; + + ConvParams params; + params.stride = stride_; + params.padding = padding_; + params.dilation = dilation_; + params.transposed = transposed_; + params.output_padding = output_padding_; + params.groups = groups_; + params.benchmark = benchmark; + params.deterministic = deterministic; + params.cudnn_enabled = cudnn_enabled; + + // Compute ggO = conv(ggI, w) + conv(i, ggW) + ggb + Tensor ggO; + if (ggI.defined()) { + if (weight.type().is_cuda()) { + weight = weight.contiguous(); + } + ggO = at::_convolution(ggI, weight, Tensor(), params.stride, params.padding, params.dilation, params.transposed, params.output_padding, params.groups, params.benchmark, params.deterministic, params.cudnn_enabled); + } + + if (ggW.defined()) { + if (ggW.type().is_cuda()) { + ggW = ggW.contiguous(); + } + auto ggW_term = 
at::_convolution(input, ggW, Tensor(), params.stride, params.padding, params.dilation, params.transposed, params.output_padding, params.groups, params.benchmark, params.deterministic, params.cudnn_enabled); + if (ggO.defined()) { + ggO = ggO + ggW_term; + } else { + ggO = ggW_term; + } + } + + if (ggb.defined()) { + // View as (1, ggb.size(0), 1, 1...) + + // Expand + std::vector new_size(gO.ndimension(), 1); + new_size[1] = ggb.sizes()[0]; + auto ggb_contiguous = ggb.contiguous(); + auto ggb_view = ggb_contiguous.view(new_size); + + // Expand + auto ggb_expanded = ggb_view.expand(gO.sizes()); + + if (ggO.defined()) { + ggO = ggO + ggb_expanded; + } else { + ggO = ggb_expanded; + } + } + + // Compute gW = conv(ggI, gO) + Tensor gW; + if (ggI.defined()) { + // Modified params with correct padding + ConvParams gw_conv_params(params); + + // Disable groups as they are handled separately + auto groups = gw_conv_params.groups; + gw_conv_params.groups = 1; + std::swap(gw_conv_params.dilation, gw_conv_params.stride); + + // Transpose gO and ggI to accumulate over batch + auto gOt = gO.transpose(0, 1); + auto ggIt = ggI.transpose(0, 1); + + Tensor gWt; + // Compute conv + if (groups == 1) { + if (gOt.type().is_cuda()) { + gOt = gOt.contiguous(); + } + + // Compute conv + if (params.transposed) { + gw_conv_params.transposed = false; + gWt = at::_convolution(gOt, ggIt, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } else { + gWt = at::_convolution(ggIt, gOt, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } + } else { + std::vector gWt_list(groups); + for (int g = 0; g < groups; ++g) { + auto ggIt_g = subvariable(ggIt, 0, groups, g); + auto gOt_g = subvariable(gOt, 0, groups, g); + if (gOt_g.type().is_cuda()) { + gOt_g = gOt_g.contiguous(); + } + + // Compute conv + if (params.transposed) { + gw_conv_params.transposed = false; + gWt_list[g] = at::_convolution(gOt_g, ggIt_g, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } else { + gWt_list[g] = at::_convolution(ggIt_g, gOt_g, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } + } + + gWt = at::cat(gWt_list, 1); + } + + // Transpose gW to match chan_in and chan_out + gW = gWt.transpose(0, 1); + + // narrow gW to only relevant portion + // we do it this way instead of narrowing the input itself because + // the ConvForward kernels don't support asymmetric padding. 
+ auto gW_size = gW.sizes(); + auto w_size = weight.sizes(); + for (size_t i = 2; i < gW_size.size(); ++i) { + if (gW_size[i] > w_size[i]) { + gW = gW.narrow(i, 0, w_size[i]); + gW_size = gW.sizes(); + } + } + } + + // Compute gI = convT(ggW, gO.t()) if !transposed + // gI = conv(go, ggw) if transposed + Tensor gI; + if (ggW.defined()) { + ConvParams gi_conv_params(params); + gi_conv_params.transposed = !params.transposed; + + if (params.transposed) { + if (gO.type().is_cuda()) { + gO = gO.contiguous(); + } + gI = at::_convolution(gO, ggW, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + + // narrow gI to only relevant portion + // we do it this way because negative output_padding is not supported + // TODO: figure out if we can narrow gO and save some compute, + // rather than narrowing the computed gI + auto gI_size = gI.sizes(); + auto i_size = input.sizes(); + for (size_t i = 2; i < gI_size.size(); ++i) { + if (gI_size[i] > i_size[i]) { + gI = gI.narrow(i, 0, i_size[i]); + gI_size = gI.sizes(); + } + } + } else { + auto groups = gi_conv_params.groups; + gi_conv_params.groups = 1; + // swap stride and dilation + std::swap(gi_conv_params.dilation, gi_conv_params.stride); + + auto ggWt = ggW.transpose(0, 1); + auto gOt = gO.transpose(0, 1); + + // calculate output_padding + // TODO: figure out why this needs to be computed... + auto kernel_size = weight.sizes().slice(2); + auto input_shape = input.sizes().slice(2); + auto grad_output_shape = gO.sizes().slice(2); + + if (kernel_size.size() == 1) { + auto expected_input_shape = (kernel_size[0] - 1) * gi_conv_params.stride[1] + - 2 * gi_conv_params.padding[1] + + (gi_conv_params.dilation[1] * (grad_output_shape[0] - 1) + 1); + if (expected_input_shape != input_shape[0]) { + gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape; + } + } else { + for(size_t i = 0; i < kernel_size.size(); ++i) { + // Check if whole input has been used or not + auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.stride[i] + - 2 * gi_conv_params.padding[i] + + (gi_conv_params.dilation[i] * (grad_output_shape[i] - 1) + 1); + if (expected_input_shape != input_shape[i]) { + gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape; + } + } + } + + Tensor gIt; + if (params.groups == 1) { + if (gOt.type().is_cuda()) { + gOt = gOt.contiguous(); + } + + gIt = at::_convolution(ggWt, gOt, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + } else { + std::vector gIt_list(params.groups); + for (int g = 0; g < groups; ++g) { + auto ggWt_g = subvariable(ggWt, 1, groups, g); + auto gOt_g = subvariable(gOt, 0, groups, g); + if (gOt_g.type().is_cuda()) { + gOt_g = gOt_g.contiguous(); + } + + gIt_list[g] = at::_convolution(ggWt_g, gOt_g, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + } + + gIt = at::cat(gIt_list, 0); + } + + gI = gIt.transpose(0, 1); + } + } + + if (output_mask[0] && !ggO.defined()) ggO = at::zeros_like(gO); + if 
(output_mask[1] && !gI.defined()) gI = at::zeros_like(input); + if (output_mask[2] && !gW.defined()) gW = at::zeros_like(weight); + + return std::tuple{ggO, gI, gW}; +} + +}} // at::native diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp new file mode 100644 index 0000000..0c2ac96 --- /dev/null +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -0,0 +1,107 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { +namespace native { + +Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, int64_t pad) { + AT_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " + "in_channel"); + AT_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," + " in_channels, out_channels."); + AT_CHECK(bias.dim() == 1, "Bias must be 1-D"); + + auto input_size = self.sizes(); + auto weight_size = weight.sizes(); + + auto ilen = input_size[0]; + auto batchSize = input_size[1]; + auto inputPlanes = input_size[2]; + auto outputPlanes = weight_size[2]; + auto kw = weight_size[0]; + auto olen = input_size[0] - kw + 1 + pad * 2; + auto real_pad = (olen - ilen + kw - 1) / 2; + + // Make sure shapes are correct. + // Input = (time, batch, in_channels) + // Weight = (kernel_width, in_channels, out_channels) + // Bias = (out_channels) + AT_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " + "is not == dim 1 in the weight tensor"); + AT_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " + "the weight tensor (output channels)."); + + // input * weights + bias -> output_features + Tensor output = self.type().tensor({ + olen, + input_size[1], + weight_size[2], + }); + output.copy_(bias.expand(output.sizes())); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, static_cast(k - real_pad)); + int oShift = std::max(0, static_cast(real_pad - k)); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // Note: gemm assumes column-major matrices + // input is l*m (row-major) + // weight is m*r (row-major) + // output is l*r (row-major) + if (t > 0) { + auto W = weight[k]; + auto I = self.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); + auto O = output.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + O.addmm_(I, W); + } + } + return output; +} + +std::tuple conv_tbc_backward(const Tensor& dOutput, const Tensor& input, const Tensor& weight, const Tensor& bias, int64_t pad) { + auto input_size = input.sizes(); + auto weight_size = weight.sizes(); + + auto ilen = input_size[0]; + auto batchSize = input_size[1]; + auto inputPlanes = input_size[2]; + auto outputPlanes = weight_size[2]; + auto kw = weight.sizes()[0]; + auto olen = input_size[0] - kw + 1 + pad * 2; + int real_pad = (olen - ilen + kw - 1) / 2; + + Tensor dInput = at::zeros_like(input); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, k - real_pad); + int oShift = std::max(0, real_pad - k); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // dOutput * T(weight) -> dInput + if (t > 0) { + auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + auto dI = dInput.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); + dI.addmm_(dO, weight[k].t()); + } + } + + Tensor dWeight = at::zeros_like(weight); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, k - real_pad); + int oShift = std::max(0, real_pad - k); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // T(input) * dOutput -> dWeight + if (t > 0) { + auto dW = 
dWeight[k]; + auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}).t(); + dW.addmm_(I, dO); + } + } + + Tensor dBias = at::zeros_like(bias); + auto tmp = dOutput.sum(0, false); + dBias.copy_(tmp.sum(0)); + + return std::make_tuple(dInput, dWeight, dBias); +} + +} +} diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp new file mode 100644 index 0000000..da49e28 --- /dev/null +++ b/aten/src/ATen/native/Distance.cpp @@ -0,0 +1,10 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + + +namespace at { namespace native { + +Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double eps, bool keepdim) { + return at::norm(x1 - x2 + eps, p, 1, keepdim); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp new file mode 100644 index 0000000..a9bd61a --- /dev/null +++ b/aten/src/ATen/native/Distributions.cpp @@ -0,0 +1,208 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Generator.h" +#include "ATen/native/Distributions.h" + +#include + +#include "TH/THRandom.h" +#include "TH/THGenerator.hpp" +#include "TH/THMath.h" + +namespace { +/* + * This section is a counterpart to Distributions.cu + * + */ + +// The function `sample_poisson` +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +THGenerator* get_generator(at::Generator* gen) { + auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto gen_ = at::check_generator(gen, default_gen); + return gen_->generator; +} + +int64_t sample_poisson(double lambda, THGenerator* generator) { + if (lambda >= 10) { + // transformed rejection method, (Hoermann, 1993) + int64_t k; + double U, V, a, b, invalpha, vr, us; + + double slam = std::sqrt(lambda); + double loglam = std::log(lambda); + b = 0.931 + 2.53 * slam; + a = -0.059 + 0.02483 * b; + invalpha = 1.1239 + 1.1328 / (b - 3.4); + vr = 0.9277 - 3.6224 / (b - 2); + + while (1) { + U = THRandom_standard_uniform(generator) - 0.5; + V = THRandom_standard_uniform(generator); + us = 0.5 - std::fabs(U); + k = (int64_t)std::floor((2 * a / us + b) * U + lambda + 0.43); + if ((us >= 0.07) && (V <= vr)) { + return k; + } + if ((k < 0) || ((us < 0.013) && (V > us))) { + continue; + } + if ((std::log(V) + std::log(invalpha) - std::log(a / (us * us) + b)) <= + (-lambda + k * loglam - std::lgamma((double)k + 1))) { + return k; + } + } + } else if (lambda == 0) { + return 0; + } else { + int64_t X; + double prod, U, enlam; + + enlam = std::exp(-lambda); + X = 0; + prod = 1.0; + while (1) { + U = THRandom_standard_uniform(generator); + prod *= U; + if (prod > enlam) { + X += 1; + } else { + return X; + } + } + } +} + +} // namespace + +namespace at { +namespace native { + +Tensor bernoulli(const Tensor& self, const Tensor& p, Generator* gen) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli_(result, p, gen); +} + +Tensor bernoulli(const Tensor& self, double p, Generator* gen) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli_(result, p, gen); +} + +Tensor bernoulli(const Tensor& self) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli(result, self, nullptr); +} + +Tensor& bernoulli_(Tensor& self, const Tensor& p_, Generator* gen) { + if (!self.is_cuda() && !p_.is_cuda()) { + Tensor p = p_.toType(kDouble); + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2( + self, p, [generator](scalar_t& ret_val, double& p_val) { + ret_val = (scalar_t)THRandom_bernoulli(generator, p_val); + }); + }); + return self; + } + self.copy_(at::_th_bernoulli(std::get<0>(expand_inplace(self, p_)), gen)); + return self; +} + +Tensor& bernoulli_(Tensor& self, double p, Generator* gen) { + self._bernoulli_(p, gen); + return self; +} + +Tensor& bernoulli_(Tensor& self) { + return native::bernoulli_(self, 0.5, nullptr); +} + +Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_FLOATING_TYPES(self.type(), "_standard_gamma_grad", [&] { + CPU_tensor_apply3(ret, self, output, + [](scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { + ret_val = standard_gamma_grad_one(self_val, output_val); + } + ); + }); + return ret; +} + +/* + * This section is a counterpart to Distributions.cu + */ + +Tensor _s_poisson_cpu(const Tensor& lambda, Generator *gen) { + Tensor ret = at::zeros(lambda.sizes(), lambda.type()); + AT_DISPATCH_FLOATING_TYPES(ret.type(), "poisson", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2(ret, lambda, + [generator](scalar_t& ret_val, const 
scalar_t& lambda){ + ret_val = static_cast(sample_poisson(static_cast(lambda), generator)); + } + ); + }); + return ret; +} + +Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) { + Tensor ret = at::zeros(alpha.sizes(), alpha.type()); + AT_DISPATCH_FLOATING_TYPES(ret.type(), "gamma", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2(ret, alpha, + [generator](scalar_t& ret_val, const scalar_t& alpha){ + BaseSampler standard_uniform([generator] () { + return THRandom_standard_uniform(generator); + }); + BaseSampler standard_normal([generator] () { + return THRandom_normal(generator, 0.0, 1.0); + }); + auto sample = sample_gamma(alpha, standard_uniform, standard_normal); + ret_val = std::max(std::numeric_limits::min(), (scalar_t) sample); + } + ); + }); + + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h new file mode 100644 index 0000000..7a6e078 --- /dev/null +++ b/aten/src/ATen/native/Distributions.h @@ -0,0 +1,215 @@ +#include "TH/THMath.h" +#ifdef __CUDA_ARCH__ +#include +#endif + +namespace { + +#ifdef __CUDA_ARCH__ +#define nvfunction_or_function nvstd::function +#define deviceforcuda __device__ +#else +#define nvfunction_or_function std::function +#define deviceforcuda +// we cannot use std::isnan directly due to some incompatibility of +// gcc constexpr'ing and nvcc +#define isnan std::isnan +#endif + +template +struct BaseSampler { + nvfunction_or_function sampler; + deviceforcuda BaseSampler(nvfunction_or_function sampler): sampler(sampler) {} + deviceforcuda scalar_t sample() { + return sampler(); + } +}; + +// The function `sample_gamma` is +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +template +deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& standard_uniform, BaseSampler& standard_normal) { + accscalar_t scale = 1.0f; + + // Boost alpha for higher acceptance probability. 
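+  // The boost relies on the identity Gamma(alpha) == Gamma(alpha + 1) * U^(1/alpha)
+  // in distribution, for U ~ Uniform(0, 1); `1 - standard_uniform.sample()` is used
+  // as the uniform draw, presumably so the base of the power is never exactly zero.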
+ if (alpha < 1.0f) { + scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); + alpha += 1.0f; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const accscalar_t d = alpha - 1.0f / 3.0f; + const accscalar_t c = 1.0f / std::sqrt(9.0f * d); + for (;;) { + accscalar_t x, y; + do { + x = standard_normal.sample(); + y = 1.0f + c * x; + } while (y <= 0); + const accscalar_t v = y * y * y; + const accscalar_t u = 1 - standard_uniform.sample(); + const accscalar_t xx = x * x; + if (u < 1.0f - 0.0331f * xx * xx) + return static_cast(scale * d * v); + if (std::log(u) < 0.5f * xx + d * (1.0f - v + std::log(v))) + return static_cast(scale * d * v); + } +} + +template +deviceforcuda static inline scalar_t polevl(const scalar_t x, const scalar_t A[], size_t len) { + scalar_t result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +template +deviceforcuda static inline scalar_t digamma_one(scalar_t x) { + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + accscalar_t additional_summand = 0; + int x_is_integer = x == std::floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // it is more standard to write this as recursion, but + // nvcc does not like that + additional_summand = - static_cast(M_PI) / std::tan(static_cast(M_PI) * x); + x = 1 - x; + } + + // Push x to be >= 10 + accscalar_t result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10 + additional_summand; + } + + // Compute asymptotic digamma + static const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t y = 0; + if (x < 1.0e17f) { + accscalar_t z = 1.0 / (x * x); + y = z * polevl(z, A, 6); + } + return static_cast(result + std::log(x) - (0.5f / x) - y + additional_summand); +} + +// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha) +// for random number x drawn from a standard Gamma distribution Gamma(alpha). +template +deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { + // Use a Taylor series expansion for small x. + accscalar_t x = static_cast(x_); + accscalar_t alpha = static_cast(alpha_); + if (x < 0.8f) { + accscalar_t numer = 1; + accscalar_t denom = alpha; + auto series1 = numer / denom; + auto series2 = numer / (denom * denom); + for (int i = 1; i <= 5; ++i) { + numer *= -x / static_cast(i); + denom += 1; + series1 += numer / denom; + series2 += numer / (denom * denom); + } + const auto pow_x_alpha = std::pow(x, alpha); + const auto gamma_pdf = std::pow(x, alpha - 1) * std::exp(-x); + const auto gamma_cdf = pow_x_alpha * series1; + const auto gamma_cdf_alpha = (std::log(x) - digamma_one(alpha)) * gamma_cdf + - pow_x_alpha * series2; + const auto result = -gamma_cdf_alpha / gamma_pdf; + return isnan(result) ? static_cast( 0.f ) : static_cast(result); + } + + // Use a Rice saddle point expansion for large alpha. 
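+  // In the small-x branch above, series1 approximates sum_{k>=0} (-x)^k / (k! * (alpha + k)),
+  // so gamma_cdf ~= x^alpha * series1 is the lower incomplete gamma function gamma(alpha, x),
+  // series2 is its term-by-term derivative in alpha (up to sign), and the returned value
+  // is -(d/dalpha CDF(x; alpha)) / pdf(x; alpha), with the common 1/Gamma(alpha) factor
+  // cancelling between numerator and denominator.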
+ if (alpha > 8.0f) { + if (0.9f * alpha <= x && x <= 1.1f * alpha) { + const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha); + const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x) + - 65 * x * x / alpha + alpha * (107 + 3600 * x); + const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha); + return static_cast(numer_1 * numer_2 / denom); + } + const auto denom = std::sqrt(8 * alpha); + const auto term2 = denom / (alpha - x); + const auto term3 = std::pow(x - alpha - alpha * std::log(x / alpha), static_cast(-1.5)); + const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3; + const auto term1 = std::log(x / alpha) * term23 + - std::sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); + const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha)); + const auto numer = x * term1; + return static_cast(-stirling * numer / denom); + } + + // Use a bivariate rational approximation to the reparameterized gradient. + const auto u = std::log(x / alpha); + const auto v = std::log(alpha); + static const accscalar_t coef_uv[3][8] = { + {0.16009398, -0.094634809, 0.025146376, -0.0030648343, + 1, 0.32668115, 0.10406089, 0.0014179084}, + {0.53487893, 0.1298071, 0.065735949, -0.0015649758, + 0.16639465, 0.020070113, -0.0035938915, -0.00058392623}, + {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777, + 0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07}, + }; + accscalar_t coef_v[8]; + for (int i = 0; i < 8; ++ i) { + coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]); + } + const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3])); + const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7])); + return static_cast(std::exp(p / q)); +} + +} // namespace diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp new file mode 100644 index 0000000..7599386 --- /dev/null +++ b/aten/src/ATen/native/Embedding.cpp @@ -0,0 +1,182 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + + +namespace at { namespace native { + +Tensor embedding(const Tensor & weight, const Tensor & indices, + int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding", indices_arg, kLong); + + // TODO: use tensor.index() after improving perf + if (indices.dim() == 1) { + return weight.index_select(0, indices); + } + + auto size = std::vector(indices.sizes()); + for (auto d : weight.sizes().slice(1)) { + size.push_back(d); + } + return weight.index_select(0, indices.reshape(-1)).view(size); +} + +Tensor embedding_backward( + const Tensor & grad, const Tensor & indices, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + if (sparse) { + return at::embedding_sparse_backward( + grad, indices, num_weights, padding_idx, scale_grad_by_freq); + } else { + return at::embedding_dense_backward( + grad, indices, num_weights, padding_idx, scale_grad_by_freq); + } +} + +Tensor embedding_sparse_backward( + const Tensor & grad_, const Tensor & indices_, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq) { + + auto indices_arg = TensorArg(indices_, "indices", 2); + checkScalarType("embedding_backward", indices_arg, kLong); + + // TODO: implement scale_grad_by_freq + if (scale_grad_by_freq) { + AT_ERROR( + "embedding_backward: scale_grad_by_freq not supported with 
sparse gradients"); + } + + Tensor indices = indices_; + Tensor grad = grad_; + if (padding_idx != -1) { + auto c = indices != padding_idx; + indices = indices.index(c); + grad = grad.index(c); + } + + int64_t num_features = grad_.size(-1); + auto weight_size = std::array{{ num_weights, num_features }}; + auto& dense_type = grad.type(); + auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU); + + // check if all our grad come from padding_idx + if (grad.numel() == 0) { + // FIXME: USE_TH_SIZE_ZERO_DIM + return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor(), + dense_type.tensor(), weight_size); + } + + auto index = indices.reshape({1, -1}); + auto values = grad.reshape({-1, num_features}); + return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); +} + +Tensor embedding_dense_backward_cpu( + const Tensor & grad_, const Tensor & indices, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq) { + + auto indices_arg = TensorArg(indices, "indices", 2); + checkScalarType("embedding_backward", indices_arg, kLong); + + auto indices_contig = indices.contiguous(); + auto indices_data = indices_contig.data(); + int64_t numel = indices.numel(); + + std::unique_ptr counts; + if (scale_grad_by_freq) { + counts.reset(new int64_t[num_weights]); + for (int i = 0; i < numel; i++) { + counts[indices_data[i]] = 0; + } + for (int i = 0; i < numel; i++) { + counts[indices_data[i]]++; + } + } + + auto grad = grad_.contiguous().view({numel, grad_.size(-1)}); + auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); + +#ifdef _OPENMP + if (numel > 1000) { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. 
+ #pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int64_t start = tid * (num_weights/nthreads + 1); + int64_t end = start + (num_weights/nthreads + 1); + for (int64_t i = 0; i < numel; i++) { + if (indices_data[i] != padding_idx) { + int64_t k = indices_data[i]; + if (k >= start && k < end) { + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[k]; + } + grad_weight[k].add_(grad[i], scale); + } + } + } + } + return grad_weight; + } +#endif + + for (int64_t i = 0; i < numel; i++) { + if (indices_data[i] != padding_idx) { + int64_t k = indices_data[i]; + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[k]; + } + grad_weight[k].add_(grad[i], scale); + } + } + + return grad_weight; +} + +Tensor & embedding_renorm_cpu_( + Tensor & self, const Tensor & indices, double max_norm, double norm_type) { + auto self_arg = TensorArg(self, "self", 1); + auto indices_arg = TensorArg(indices, "indices", 2); + checkDim("embedding_renorm_", self_arg, 2); + checkScalarType("embedding_renorm_", indices_arg, kLong); + + auto indices_contig = indices.contiguous(); + + auto num_indices = indices.numel(); + auto data_ptr = indices_contig.data(); + auto sorted_indices = std::vector(data_ptr, data_ptr + num_indices); + std::sort(sorted_indices.begin(), sorted_indices.end(), std::less()); + + #pragma omp parallel for if(num_indices > 1000) + for (int64_t i = 0; i < num_indices; i++) { + if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) { + continue; + } + auto row = self[sorted_indices[i]]; + auto norm = row.norm(norm_type).toCDouble(); + if (norm > max_norm) { + auto scale = max_norm / (norm + 1e-7); + row *= scale; + } + } + + return self; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp new file mode 100644 index 0000000..d171893 --- /dev/null +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -0,0 +1,356 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include "TH/THBlasUtils.h" + +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace { + const int MODE_SUM = 0; + const int MODE_MEAN = 1; + const int MODE_MAX = 2; +} + +namespace at { +namespace native { + +static void make_offset2bag(const Tensor &offsets, const Tensor &indices, + Tensor &offset2bag) { + offset2bag.index_add_( + 0, offsets, at::ones_like(offsets)); // offset2bag = [1 0 1 0 1] + offset2bag[0] -= 1; // offset2bag = [0 0 1 0 1] + offset2bag = offset2bag.cumsum(0); // offset2bag = [0 0 1 1 2] +} + +// This function combines index_select (using select_indices as the index) and +// index_add (using add_indices as the index), without creating an intermediary +// tensor to hold the selected embeddings +template +static void index_select_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &src, + Tensor &output) { + auto add_indices_data = add_indices.data(); + auto select_indices_data = select_indices.data(); + auto src_data = src.data(); + auto output_data = output.data(); + auto numel = add_indices.numel(); + int64_t ddim = src.size(1); + auto src_stride0 = src.stride(0); + auto src_stride1 = src.stride(1); + auto output_stride0 = output.stride(0); + auto output_stride1 = output.stride(1); + for (int64_t i = 0; i < numel; i++) { + THBlas_axpy(ddim, 1, + src_data + src_stride0 * select_indices_data[i], src_stride1, + output_data + output_stride0 * add_indices_data[i], 
output_stride1); + } +} + +static void make_bag_size(const Tensor &offsets, const Tensor &indices, + const int64_t mode, Tensor &bag_size) { + if (mode == MODE_MEAN || mode == MODE_MAX) { + // Compute this for MODE_MEAN and MODE_MAX (latter needed for backwards) + if (offsets.size(0) != 1) { + bag_size.slice(0, 0, bag_size.size(0) - 1, 1) = + offsets.slice(0, 1, offsets.size(0), 1) - + offsets.slice(0, 0, offsets.size(0) - 1, 1); + } + bag_size[-1] = indices.size(0) - offsets[-1]; + } +} + +static Tensor apply_bag_size(const Tensor &offsets, const Tensor &indices, + const int64_t mode, Tensor &output, + const Tensor &bag_size) { + if (mode == MODE_MEAN) { + if (offsets.size(0) == 1) { + auto bag_size_ = indices.size(0); + output /= bag_size_; + } else { + // Avoid dividing by 0 for empty bags. + // Instead we want empty bags to return all 0s + auto bag_size_ = at::max(bag_size, at::ones_like(bag_size)) + .toType(output.type()) + .unsqueeze(1) + .expand_as(output); + output /= bag_size_; + } + } + return output; +} + +static Tensor apply_bag_size_backward(const Tensor &offsets, + const Tensor &indices, const int64_t mode, + Tensor &output, const Tensor &offset2bag, + const Tensor &bag_size) { + if (mode == MODE_MEAN) { + if (offsets.size(0) == 1) { + auto bag_size_ = indices.size(0); + output /= bag_size_; + } else { + auto inv_bag_size_ = (1 / bag_size.toType(output.type())) + .unsqueeze(1) + .index_select(0, offset2bag); + output *= inv_bag_size_; + } + } + return output; +} + + +template +std::tuple embedding_bag_cpu_max( + const Tensor& weight, const Tensor &indices, const Tensor& offset2bag, const Tensor& output, const Tensor& bag_size, const Tensor& offsets) { + + auto max_indices = at::zeros({offsets.size(0), weight.size(1)}, indices.type()); + + int64_t numel = indices.numel(); + int64_t dims = weight.size(1); + auto indices_data = indices.data(); + auto offset2bag_data = offset2bag.data(); + + auto max_indices_data = max_indices.data(); + auto max_indices_stride = max_indices.stride(0); + + auto weight_data = weight.data(); + auto output_data = output.data(); + auto weight_stride0 = weight.stride(0); + auto weight_stride1 = weight.stride(1); + auto output_stride = output.stride(0); + + for (int i = 0; i < numel; i++) { + auto bag = offset2bag_data[i]; + auto word_idx = indices_data[i]; + + + for (int dim = 0; dim < dims; dim++) { + auto& current_item = output_data[output_stride * bag + dim]; + auto weight_item = weight_data[weight_stride0 * word_idx + dim * weight_stride1]; + + bool is_first_for_bag = (i == 0) || offset2bag_data[i - 1] != bag; + + if (is_first_for_bag || weight_item > current_item) { + current_item = weight_item; + max_indices_data[max_indices_stride * bag + dim] = word_idx; + } + } + } + + return std::tuple(output, offset2bag, bag_size, max_indices); +} + +// embedding_bag wrapper to enforce contiguity in tensors other than `weight`. +// This is created to save extra `.contiguous()` call in backward. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +embedding_bag(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + return at::_embedding_bag(weight, indices.contiguous(), offsets.contiguous(), + scale_grad_by_freq, mode, sparse); + }; + +// Assumes all input tensors except for `weight` are contiguous. 
+// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +_embedding_bag_cpu(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + auto weight_arg = TensorArg(weight, "weight", 1); + checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); + + auto bag_size = at::zeros(offsets.sizes(), indices.type()); + make_bag_size(offsets, indices, mode, bag_size); + + // If the last entries are empty, that the last offsets are irrelevant as they + // won't change anything in the assignment of ID -> bag, but index_add would + // throw out of bounds error. So to keep it simple we just add one more + // entry to the end then get rid of it after make_offset2bag. + auto offset2bag = at::zeros( + {indices.sizes()[0] + 1}, indices.options()); // offset2bag = [0 0 0 0 0] + + make_offset2bag(offsets, indices, offset2bag); + + offset2bag.resize_({indices.sizes()[0]}); + + auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); + + if (mode == MODE_MEAN || mode == MODE_SUM) { + if (weight.type().scalarType() == kFloat) { + index_select_add(indices, offset2bag, weight, output); + } else if (weight.type().scalarType() == kDouble) { + index_select_add(indices, offset2bag, weight, output); + } + auto ret = apply_bag_size(offsets, indices, mode, output, bag_size); + return std::tuple(ret, offset2bag, bag_size, bag_size); + } else { // MODE_MAX + return AT_DISPATCH_FLOATING_TYPES_AND_HALF( + weight.type(), "embedding_bag_cpu_max", [&]() { + return embedding_bag_cpu_max(weight, indices, offset2bag, output, bag_size, offsets); + } + ); + } +} + +// Assumes all input tensors are contiguous. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices, + const Tensor &offsets, + const Tensor &offset2bag, + const Tensor &bag_size_, + const Tensor &max_indices_, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode, + bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + checkContiguous("embedding_bag", indices_arg); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag", offsets_arg, kLong); + checkContiguous("embedding_bag", offsets_arg); + auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); + checkScalarType("embedding_bag", offset2bag_arg, kLong); + checkContiguous("embedding_bag", offset2bag_arg); + + if (sparse) { + return at::_embedding_bag_sparse_backward( + grad, indices, offsets, offset2bag, bag_size_, num_weights, + scale_grad_by_freq, mode); + } else { + return at::_embedding_bag_dense_backward( + grad, indices, offsets, offset2bag, bag_size_, max_indices_, num_weights, + scale_grad_by_freq, mode); + } +} + +Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indices_, + const Tensor &offsets_, + const Tensor &offset2bag__, + const Tensor &bag_size_, + const Tensor& max_indices_, int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices_, offsets_ and offset2bag__ are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward above. 
+ // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. + + auto grad = grad_.contiguous(); + auto grad_arg = TensorArg(grad, "grad_", 1); + checkScalarTypes("embedding_bag", grad_arg, {kFloat, kDouble}); + + Tensor &offset2bag_ = const_cast(offset2bag__); + + auto ind_sort_ = indices_.sort(); + auto indices = std::get<0>(ind_sort_); + auto ind_sort = std::get<1>(ind_sort_); + auto offset2bag = offset2bag_.index_select(0, ind_sort); + + auto indices_data = indices.data(); + auto offsets_data = offsets_.data(); + auto offset2bag_data = offset2bag.data(); + int64_t numel = indices.numel(); + + std::vector counts(num_weights); + for (int i = 0; i < numel; i++) { + counts[indices_data[i]] = 0; + } + for (int i = 0; i < numel; i++) { + counts[indices_data[i]]++; + } + + auto index_grad_weight = + at::zeros({num_weights, grad.size(1)}, grad.type()).contiguous(); + + std::vector counts_uniq; + counts_uniq.reserve(num_weights); + int64_t o = 0; + for (int64_t i = 0; i < numel; i += counts[indices_data[i]]) { + counts_uniq.push_back(counts[indices_data[i]]); + if (o > 0) { + counts_uniq[o] += counts_uniq[o - 1]; + } + o++; + } + + if (mode == MODE_MEAN || mode == MODE_SUM) { + #pragma omp parallel for if (numel > 1000) + for (int64_t i = 0; i < (int64_t)counts_uniq.size(); i++) { + int64_t start = i == 0 ? 0 : counts_uniq[i - 1]; + int64_t index = indices_data[start]; + for (int64_t j = start; j < counts_uniq[i]; j++) { + int64_t source = offset2bag_data[j]; + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[indices_data[i]]; + } + if (mode == 1) { // MODE_MEAN + if (offsets_.size(0) == 1) { + auto bag_size = indices.size(0); + scale /= bag_size; + } else { + if (source == offsets_.size(0) - 1) { + scale /= indices.size(0) - offsets_data[offsets_.size(0) - 1]; + } else { + scale /= offsets_data[source + 1] - offsets_data[source]; + } + } + } + int64_t ddim = grad.size(1); + if (grad.type().scalarType() == kFloat) { + auto igwd = index_grad_weight.data(); + auto gd = grad.data(); + THBlas_axpy(ddim, (float)scale, gd + ddim * source, 1, + igwd + ddim * index, 1); + } else if (grad.type().scalarType() == kDouble) { + auto igwd = index_grad_weight.data(); + auto gd = grad.data(); + THBlas_axpy(ddim, (double)scale, gd + ddim * source, 1, + igwd + ddim * index, 1); + } + } + } + } else if (mode == MODE_MAX) { + auto nonempty_max_indices = max_indices_.index_select(0, bag_size_.nonzero().view(-1)); + auto nonempty_grad = grad_.index_select(0, bag_size_.nonzero().view(-1)); + + for (int64_t dim = 0; dim < grad.size(1); dim++) { + index_grad_weight.select(1, dim).index_add_( + 0, nonempty_max_indices.select(1, dim), nonempty_grad.select(1, dim)); + } + } + + return index_grad_weight; +} + +Tensor _embedding_bag_sparse_backward( + const Tensor &grad_, const Tensor &indices, const Tensor &offsets, + const Tensor &offset2bag, const Tensor &bag_size_, int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices, offsets and offset2bag are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward above. + // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. 
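+  // Data-flow sketch (indices/offsets are illustrative only): with
+  // indices = [3, 1, 1, 4, 7] and offsets = [0, 2], offset2bag is [0, 0, 1, 1, 1],
+  // so index_select(0, offset2bag) below replicates each bag's gradient row once per
+  // contributing index; MODE_MEAN then scales rows by 1/bag_size (1/2, 1/2, 1/3, 1/3,
+  // 1/3 here) before embedding_backward scatters them into a sparse gradient of
+  // shape num_weights x embedding_dim.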
+ + Tensor grad = grad_; + Tensor index_grad = grad_.index_select(0, offset2bag); + index_grad = apply_bag_size_backward(offsets, indices, mode, index_grad, + offset2bag, bag_size_); + return native::embedding_backward(index_grad, indices, num_weights, -1, + scale_grad_by_freq, true); +} +} +} // namespace at::native diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp new file mode 100644 index 0000000..0e9a594 --- /dev/null +++ b/aten/src/ATen/native/Gesv.cpp @@ -0,0 +1,126 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/native/Gesv.h" + +#include "TH.h" // for USE_LAPACK + +#include + +#ifdef USE_LAPACK +extern "C" void dgesv_( + int* n, int* nrhs, double* a, int* lda, + int *ipiv, double* b, int* ldb, int* info); +extern "C" void sgesv_( + int* n, int* nrhs, float* a, int* lda, + int* ipiv, float* b, int* ldb, int* info); +#endif + +namespace at { namespace native { + +template +void lapackGesv( + int n, int nrhs, scalar_t* a, int lda, int* ipiv, + scalar_t* b, int ldb, int* info) { + AT_ERROR("gesv only takes float or double Tensors"); +} + +#ifdef USE_LAPACK +template<> void lapackGesv( + int n, int nrhs, float* a, int lda, int* ipiv, + float* b, int ldb, int* info) { + sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template<> void lapackGesv( + int n, int nrhs, double* a, int lda, int* ipiv, + double* b, int ldb, int* info) { + dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} +#endif + +template +static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +#ifndef USE_LAPACK + AT_ERROR("gesv: LAPACK library not found in compilation"); +#endif + auto A_data = A.data(); + auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + + auto batch_size = batchCount(A); + auto n = A.size(-2); + auto nrhs = b.size(-1); + + auto ipiv = at::empty({n}, b.type().toScalarType(kInt)); + + for (int64_t i = 0; i < batch_size; i++) { + int info; + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackGesv(n, nrhs, A_working_ptr, n, ipiv.data(), + b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; + } + } +} + +std::tuple _gesv_helper_cpu(const Tensor& self, const Tensor& A) { + std::vector infos(batchCount(A), 0); + auto A_working_copy = cloneBatchedColumnMajor(A); + auto b_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{ + applyGesv(b_working_copy, A_working_copy, infos); + }); + checkErrors(infos); + return std::tuple(b_working_copy, A_working_copy); +} + +// Supports arbitrary batch dimensions for self and A +std::tuple gesv(const Tensor& self, const Tensor& A) { + if (self.dim() <= 2 && A.dim() <= 2) { + // TODO: #7102: It's not necessary to have gesv (single) bindings for both + // TH and ATen. We should remove the TH gesv bindings, especially + // since the lapackGesv function is already in ATen. + return at::_gesv_single(self, A); + } + + checkInputs(self, A); + + // broadcast the batch dimensions of self and A. 
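+  // Broadcast example (shapes are illustrative only): for A of shape (4, 1, 3, 3)
+  // and b of shape (2, 3, 5), the batch dims (4, 1) and (2,) broadcast to (4, 2),
+  // so b expands to (4, 2, 3, 5) and A to (4, 2, 3, 3); the helper then solves the
+  // 4 * 2 = 8 linear systems independently.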
+ IntList self_batch_sizes(self.sizes().data(), self.ndimension() - 2); + IntList A_batch_sizes(A.sizes().data(), A.ndimension() - 2); + std::vector expand_batch_portion = + infer_size(self_batch_sizes, A_batch_sizes); + + std::vector self_expand_size({expand_batch_portion}); + self_expand_size.insert(self_expand_size.end(), + { self.size(-2), self.size(-1) }); + + std::vector A_expand_size({expand_batch_portion}); + A_expand_size.insert(A_expand_size.end(), + { A.size(-2), A.size(-1) }); + + Tensor self_broadcasted = self.expand(self_expand_size); + Tensor A_broadcasted = A.expand(A_expand_size); + return self.type()._gesv_helper(self_broadcasted, A_broadcasted); +} + +std::tuple gesv_out( + Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { + if (self.dim() > 2 || A.dim() > 2) { + AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " + "b.dim() (%lld) and A.dim() (%lld) must both be 2.", + (long long)self.dim(), (long long)A.dim()); + } + return at::_gesv_single_out(solution, lu, self, A); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h new file mode 100644 index 0000000..2d26552 --- /dev/null +++ b/aten/src/ATen/native/Gesv.h @@ -0,0 +1,32 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +static inline void checkInputs(const Tensor& self, const Tensor& A) { + if (A.size(-1) != A.size(-2)) { + AT_ERROR("A must be batches of square matrices, " + "but they are %lld by %lld matrices", + (long long)A.size(-1), (long long)A.size(-2)); + } + if (A.size(-1) != self.size(-2)) { + AT_ERROR("Incompatible matrix sizes for matmul: each A " + "matrix is %llu by %lld but each b matrix is %lld by %lld.", + (long long)A.size(-1), (long long)A.size(-1), + (long long)self.size(-2), (long long)self.size(-1)); + } +} + +static inline void checkErrors(std::vector infos) { + for (size_t i = 0; i < infos.size(); i++) { + auto info = infos[i]; + if (info < 0) { + AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", + (long long)i, -info); + } else if (info > 0) { + AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", + (long long)i, info, info); + } + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp new file mode 100644 index 0000000..9720adb --- /dev/null +++ b/aten/src/ATen/native/Indexing.cpp @@ -0,0 +1,327 @@ +// Indexing tensors by by tensors +// +// This corresponds to "advanced indexing" in NumPy. The two operations are: +// +// index(Tensor self, indices) -> Tensor +// index_put_(Tensor self, indices, value) +// +// The index is a TensorList containg kLong or kByte tensors or nulls. Byte +// tensors (boolean masks) are expanded to long tensors via nonzero(). Null +// tensors signify that the dimension is not indexed. +// +// All indexes are broadcast together and iterated as *one*. From NumPy: +// +// result[i_1, ..., i_M] == x[ind_1[i_1, ..., i_M], ind_2[i_1, ..., i_M], +// ..., ind_N[i_1, ..., i_M]] +// +// Note 1: ByteTensors expand to index as many dimensions as there are in the +// mask. +// +// Note 2: The behavior is more complicated when the index tensors are not all +// adjacent (e.g. x[[0, 1], :, [2, 3]]). 
In this case, self and the index +// tensors are transposed to the front: x.transpose(1, 2)[[0, 1], [2, 3]] + + +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/ExpandUtils.h" + +#include +#include +#include +#include + +namespace at { namespace native { + +[[noreturn]] +static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) { + std::stringstream ss; + ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; + ss << " does not match the shape of the indexed tensor " << self.sizes(); + ss << " at index " << idx; + throw std::runtime_error(ss.str()); +} + +static void checkIndexTensorTypes(TensorList indices) { + for (auto& tensor : indices) { + if (tensor.defined()) { + auto& type = tensor.type(); + auto scalarType = type.scalarType(); + if (scalarType != kLong && scalarType != kByte) { + throw std::runtime_error("tensors used as indices must be long or byte tensors"); + } + } + } +} + +static std::vector expandByteTensors(const Tensor & self, TensorList indices) { + // Expands byte tensors (masks) into the equivalent indexing by LongTensors + std::vector result; + for (auto & index : indices) { + if (index.type().scalarType() == kByte) { + // The sizes of the ByteTensor mask must match the sizes of the + // corresponding dimensions in self + for (int64_t j = 0; j < index.dim(); j++) { + int64_t srcIdx = result.size() + j; + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); +#ifndef USE_TH_SIZE_ZERO_DIM + auto special_empty = nonzero.numel() == 0; +#else + auto special_empty = false; +#endif + for (int64_t j = 0; j < index.dim(); j++) { + if (special_empty) { + // We can't call select on an empty tensor so we just create an empty + // tensor. + result.emplace_back(nonzero.type().tensor()); + } else { + result.emplace_back(nonzero.select(1, j)); + } + } + } else { + result.emplace_back(index); + } + } + return result; +} + +static bool hasContiguousSubspace(TensorList tl) { + // true if all the non-null tensors are adjacent + auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; + auto isNull = [](const Tensor & tensor){ return !tensor.defined(); }; + auto start = std::find_if(tl.begin(), tl.end(), isDefined); + auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined); + auto it = std::find_if(start, stop.base(), isNull); + return it == stop.base(); +} + +// Transposes the tensor and indices together so that all the non-null indices +// index the first k dimensions of the tensor. Returns the transposed tensor +// and the reordered indices. 
For example: +// transposeToFront(tensor, {nullptr, a, nullptr, b}) +// returns +// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr} +static std::tuple> +transposeToFront(Tensor self, TensorList indices) { + std::vector dims; + std::vector transposedIndices; + dims.reserve(self.dim()); + for (int64_t i = 0; i < self.dim(); i++) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (int64_t i = 0; i < self.dim(); i++) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices)); +} + +static std::vector computeLinearStride(const Tensor & tensor) { + // computes the stride as if tensor were contigous + auto sizes = tensor.sizes(); + std::vector stride(tensor.dim()); + stride[tensor.dim() - 1] = 1; + std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1, std::multiplies()); + return stride; +} + +// Unsqueezes src `before` times at the front and `after` times at the end +static Tensor unsqueezeN(const Tensor & src, int64_t before, int64_t after) { + auto srcSizes = src.sizes(); + auto nDim = src.dim(); + std::vector sizes(nDim + before + after, 1); + for (int64_t i = 0; i < nDim; i++) { + sizes[i + before] = srcSizes[i]; + } + return src.view(sizes); +} + +static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) { + if (index.numel() != 0) { + auto max_idx = index.max().toCLong(); + auto min_idx = index.min().toCLong(); + if (max_idx >= dim_size) { + AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + } + if (min_idx < -dim_size) { + AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + } + } + return index.remainder(dim_size); +} + +static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { + auto strides = computeLinearStride(src); + Type& longType = src.type().toScalarType(kLong); + + // Compute the linear index by multiplying the indexing tensors by the + // stride and summing them. All the indexing tensors have the same shape at + // this point. We also compute the number of dimensions before and after that + // are not being index. 
+ Tensor linearIndex; + int64_t emptyBefore = 0, emptyAfter = 0, nElemBefore = 1, nElemAfter = 1; + for (int64_t i = 0; i < src.dim(); i++) { + if (indices[i].defined()) { + // Cast index to the longType matching src's backend + // This allows us to support ie indexing a cuda tensor with a cpu tensor + Tensor index = (wrapIndexOnce(indices[i], i, src.size(i)) * strides[i]).toType(longType); + if (linearIndex.defined()) { + linearIndex += index; + } else { + linearIndex = index; + } + } else if (linearIndex.defined()) { + emptyAfter++; + nElemAfter *= src.size(i); + } else { + emptyBefore++; + nElemBefore *= src.size(i); + } + } + + // Compute the linear indices for the parts of the tensor not being indexed + Tensor beforeIndex; + if (emptyBefore > 0) { + auto index = at::arange(0, nElemBefore, longType) * strides[emptyBefore - 1]; + index = index.view(src.sizes().slice(0, emptyBefore)); + beforeIndex = unsqueezeN(index, 0, linearIndex.dim() + emptyAfter); + } + Tensor afterIndex; + if (emptyAfter > 0) { + auto index = at::arange(0, nElemAfter, longType); + index = index.view(src.sizes().slice(src.dim() - emptyAfter, emptyAfter)); + afterIndex = unsqueezeN(index, linearIndex.dim() + emptyBefore, 0); + } + + // Sum with broadcasting to compute the full index + linearIndex = unsqueezeN(linearIndex, emptyBefore, emptyAfter); + if (beforeIndex.defined()) { + linearIndex = linearIndex + beforeIndex; + } + if (afterIndex.defined()) { + linearIndex = linearIndex + afterIndex; + } + return linearIndex; +} + +#ifndef USE_TH_SIZE_ZERO_DIM +static bool hasEmptyTensor(TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.defined() && tensor.numel() == 0) { + return true; + } + } + return false; +} +#endif + +static std::tuple makeLinearIndex(Tensor self, TensorList orig) { + checkIndexTensorTypes(orig); + // first expand ByteTensor (boolean masks) into 1 or more LongTensors + auto indices = expandByteTensors(self, orig); +#ifndef USE_TH_SIZE_ZERO_DIM + if (hasEmptyTensor(indices)) { + return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); + } +#endif + // next broadcast all index tensors together + indices = expand_outplace(indices); + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices) = transposeToFront(self, indices); + } + auto linearIndex = computeLinearIndex(self, indices); + return std::make_tuple(self, linearIndex); +} + +Tensor index(const Tensor & self, TensorList indices) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + return src.take(linearIndex); +} + +Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + Tensor dst = src.clone(); + return dst.put_(linearIndex, expandedValue); +} + +Tensor & index_put_(Tensor & self, TensorList indices, const 
Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + return src.put_(linearIndex, expandedValue); +} + +Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + dim = maybe_wrap_dim(dim, self.dim()); + + if (index.dim() >= 2) { + AT_ERROR( + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + } + int64_t numIndices = index.numel(); + if (source.dim() == 0 && numIndices != 1) { + AT_ERROR( + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + } + if (index.type().scalarType() != ScalarType::Long) { + AT_ERROR("index_copy_(): Expected LongTensor for index"); + } + + // Check that source and destination slices have the same size + auto selfSlicedSizes = std::vector(self.sizes()); + if (selfSlicedSizes.size() > 0) { + selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); + } + auto sourceSlicedSizes = std::vector(source.sizes()); + if (sourceSlicedSizes.size() > 0) { + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); + } + if (selfSlicedSizes.size() != sourceSlicedSizes.size() || + !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), + sourceSlicedSizes.begin())) { + std::stringstream ss; + ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; + ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; + ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; + throw std::runtime_error(ss.str()); + } + if (source.dim() > 0 && numIndices != source.size(dim)) { + AT_ERROR( + "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); + } + + return self._indexCopy_(dim, index, source); +} + +}} // at::native diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp new file mode 100644 index 0000000..203bd5f --- /dev/null +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -0,0 +1,342 @@ +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + static bool _type_has_native(const Type& dtype) { + return dtype.is_sparse(); + } + + static bool _has_native(const Tensor& self) { + return _type_has_native(self.type()); + } +} + +// These native operations are not "really" native; they're actually just bridge +// functions that decide whether or not to call native sparse functions, or +// TH functions. This file should be temporary; when all of TH gets ported, we +// can just use the native mechanism straight. 
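+// Every bridge in this file follows the same shape (a schematic sketch, not an
+// operator added by this diff):
+//
+//   Tensor op(const Tensor& self, ...) {
+//     if (_has_native(self)) {         // today this just means self.is_sparse()
+//       return native_op(self, ...);   // ATen-native (sparse) kernel
+//     } else {
+//       return th_op(self, ...);       // legacy TH binding
+//     }
+//   }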
+ +// TODO: Maybe the foo_ variants should call th_foo_ + +Tensor norm(const Tensor & self, Scalar p) { + if (_has_native(self)) { + return native_norm(self, p); + } else { + return th_norm(self, p); + } +} + +Tensor clone(const Tensor& self) { + if (_has_native(self)) { + return native_clone(self); + } else { + return th_clone(self); + } +} + +Tensor& resize_as_(Tensor& self, const Tensor& the_template) { + if (_has_native(self)) { + return native_resize_as_(self, the_template); + } else { + return th_resize_as_(self, the_template); + } +} + +Tensor& pow_out(Tensor& result, const Tensor& self, Scalar exponent) { + if (_has_native(self)) { + return native_pow_out(result, self, exponent); + } else { + return th_pow_out(result, self, exponent); + } +} + +Tensor pow(const Tensor& self, Scalar exponent) { + if (_has_native(self)) { + return native_pow(self, exponent); + } else { + return th_pow(self, exponent); + } +} + +Tensor& zero_(Tensor& self) { + if (_has_native(self)) { + return native_zero_(self); + } else { + return th_zero_(self); + } +} + +// Note [Multiple dispatch to sparse] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// In an ideal world, we would use direct support for multiple dispatch to +// say that add(Dense, Dense) should dispatch to one function, while +// add(Dense, Sparse) should dispatch to another function. +// +// In a world where we only have single dispatch, we can single dispatch on +// the first function, and then do an is_sparse() test on the second argument +// to direct ourselves to the correct argument. +// +// We are in neither of those worlds. Instead, we have a th_add function +// which has legacy implementations in the single dispatch world, BUT our +// actual add function needs to call s_native_add if the function *would have* +// utilized a sparse kernel that is natively implemented. +// +// th_add is "good old single dispatch" which internally handles the is_sparse() +// test and also handles broadcasting. s_native_add works asymmetrically: +// it doesn't handle broadcasting at all, and it ASSUMES that the relevant +// argument is a sparse tensor. Why the asymmetry? It turns out it is not +// so easy to figure out if a kernel is implemented in THS; it's not as simple +// as testing if the first argument is sparse, because, e.g., +// in add(Dense, Sparse), the sparse kernel is in the second argument. So, +// the trampoline function is going to know about the overloads *anyway*; it +// might as well also handle is_sparse() and broadcasting while it's at it. +// +// Why not change TH to follow this new scheme? We could... but since it's +// all going away when we finish porting the TH functions to ATen, we haven't +// done it. + +Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add_out"); + return s_native_add_out(result, b_self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + // TODO: Perhaps doing overload selection with SparseTensorRef is + // confusing, and we should have given these overloads different names. + // For now, we do it this way for consistency with the TH bindings + // (not that it is terribly consistent anyway). 
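+    // Dispatch summary for add/add_/add_out (D = dense, S = sparse):
+    //   (S, S) -> broadcast, then s_native_add*              (sparse-sparse kernel)
+    //   (D, S) -> native_add*(self, SparseTensorRef(other))  (this branch, no broadcast)
+    //   (D, D), (S, D) -> th_add*                            (TH path, which does its
+    //                                                          own is_sparse handling)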
+ return native_add_out(result, self, SparseTensorRef(other), alpha); + } else { + return th_add_out(result, self, other, alpha); + } +} + +// NB: You may be tempted to implement add and add_ just as calls to add_out, but +// calling the actual implementing function matters, because broadcast +// will be handled differently depending on if you call add_ or (a seemingly +// equivalent) add_out. Arguably this mismatch in treatment is a bug, +// c.f., https://github.com/pytorch/pytorch/issues/8308 but fixing this +// bug would involve changing a lot of other places, so we leave it +// alone for now. + +Tensor add(const Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add"); + return s_native_add(b_self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + return native_add(self, SparseTensorRef(other), alpha); + } else { + return th_add(self, other, alpha); + } +} + +Tensor& add_(Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "add_"); + return s_native_add_(self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + return native_add_(self, SparseTensorRef(other), alpha); + } else { + return th_add_(self, other, alpha); + } +} + + +Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "sub_out"); + return s_native_sub_out(result, b_self, b_other, alpha); + } else { + return th_sub_out(result, self, other, alpha); + } +} + +Tensor sub(const Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "sub"); + return s_native_sub(b_self, b_other, alpha); + } else { + return th_sub(self, other, alpha); + } +} + +Tensor& sub_(Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "sub_"); + return s_native_sub_(self, b_other, alpha); + } else { + return th_sub_(self, other, alpha); + } +} + + +Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "mul_out"); + return s_native_mul_out(result, self, other); + } else { + return th_mul_out(result, self, other); + } +} + +Tensor mul(const Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "mul"); + return s_native_mul(self, other); + } else { + return th_mul(self, other); + } +} + +Tensor& mul_(Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "mul_"); + return s_native_mul_(self, b_other); + } else { + return th_mul_(self, other); + } +} + +Tensor& mul_out(Tensor& result, const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul_out(result, self, other); + } else { + return 
th_mul_out(result, self, other); + } +} + +Tensor mul(const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul(self, other); + } else { + return th_mul(self, other); + } +} + +Tensor& mul_(Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul_(self, other); + } else { + return th_mul_(self, other); + } +} + + +Tensor& div_out(Tensor& result, const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div_out(result, self, other); + } else { + return th_div_out(result, self, other); + } +} + +Tensor div(const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div(self, other); + } else { + return th_div(self, other); + } +} + +Tensor& div_(Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div_(self, other); + } else { + return th_div_(self, other); + } +} + +Tensor& addmm_out(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + Tensor b_self; + std::tie(b_self) = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); + return s_native_addmm_out(result, b_self, mat1, mat2, beta, alpha); + } else { + return th_addmm_out(result, self, mat1, mat2, beta, alpha); + } +} + +Tensor addmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + Tensor b_self; + std::tie(b_self) = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); + return s_native_addmm(b_self, mat1, mat2, beta, alpha); + } else { + return th_addmm(self, mat1, mat2, beta, alpha); + } +} + +Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + // inplace is not broadcasting + return s_native_addmm_(self, mat1, mat2, beta, alpha); + } else { + return th_addmm_(self, mat1, mat2, beta, alpha); + } +} + + +Tensor tensor(const Type& dtype) { + if (_type_has_native(dtype)) { + return dtype.native_tensor(); + } else { + return dtype.th_tensor(); + } +} + +Tensor tensor(const Type& dtype, ArrayRef size) { + if (_type_has_native(dtype)) { + return dtype.native_tensor(size); + } else { + return dtype.th_tensor(size); + } +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { + return values.type().toSparse().native_sparse_coo_tensor(indices, values); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { + return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); +} + +Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { + return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); +} + +int64_t get_device(const Tensor& self) { + if (_has_native(self)) { + return native_get_device(self); + } else { + return _th_get_device(self); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp new file mode 100644 index 0000000..cb24e71 --- /dev/null +++ b/aten/src/ATen/native/Linear.cpp @@ -0,0 +1,440 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtilsMulti.h" + +namespace at { namespace native { + + +// sumproduct_pair computes 
`(left*right).sum(sumdims)` by means of permutation and +// batch matrix multiplication +// its main purpose is to provide a pairwise reduction for einsum +static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntList sum_dims_, bool keepdim) { + // assumes that tensors have been pre-unsqueezed (so that all dimensions match - after broadcasting) + // but makes no other assumptions on the order of dimensions + AT_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); + if (sum_dims_.size() == 0) + return at::mul(left_, right_); + int64_t dim = left_.dim(); + auto sum_dims = dim_list_to_bitset(sum_dims_, dim); + // dimensions that will be part of the output (i.e. not summed over) in three vectors + // dims in lro appear in left, right and output, similarly lo: left and output, ro: right and output + // also the sizes are kept track of for reshaping + std::vector lro, lo, ro; + int64_t lro_size = 1, lo_size = 1, ro_size = 1, sum_size = 1; + Tensor left = left_; + Tensor right = right_; + for (int64_t i = 0; i < dim; i++) { + auto sl = left.size(i)>1; + auto sr = right.size(i)>1; + if (sum_dims[i]) { // first dimensions that will be summed over after multiplication + if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size + AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + sum_size *= left.size(i); + } else if (sl) { // if it is only in one of left and right, we can sum right away + left = left.sum(i, true); + } else if (sr) { + right = right.sum(i, true); + } + } else if (sl && sr) { // now deal with dimensions dimensions that will be in the output + // dimensions nontrivially in both left and right must be of the same size + AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + lro.push_back(i); + lro_size *= left.size(i); + } else if (sl) { // keep track of dimensions appearing only once + lo.push_back(i); + lo_size *= left.size(i); + } else { + ro.push_back(i); + ro_size *= right.size(i); + } + } + // we now work with the following permutations / shapes. 
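+  // A worked example of the classification above and the pipeline described below
+  // (illustrative only; the sizes are hypothetical): with pre-unsqueezed inputs
+  // left of size (B, I, J, 1) and right of size (B, 1, J, K) and sum_dims_ = {2}
+  // (the J dimension),
+  //   dim 0 -> lro (in left, right and output), dim 1 -> lo (left and output),
+  //   dim 2 -> summed (sum_size = J),           dim 3 -> ro (right and output),
+  // so left is reshaped to (B, I, J), right to (B, J, K), bmm produces (B, I, K),
+  // and the result is viewed as (B, I, 1, K) below.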
+  // the pipeline is permute inputs -> reshape inputs -> batch matrix mul -> reshape(view) output -> permute output
+  // output: "lro, lo, 1-for-summed-dims, ro" with original shape dimensions
+  // left: "lro, lo, summed" permuted with lpermutation and the three flattened
+  // right: "lro, summed, ro" permuted with rpermutation and the three flattened
+  // then the permuted output is a view of bmm(left, right)
+  // finally, opermutation reverts the permutation to the original order of dimensions
+  std::vector<int64_t> out_size;
+  for (auto& d : lro) out_size.push_back(left.size(d));
+  for (auto& d : lo) out_size.push_back(left.size(d));
+  for (auto& d : sum_dims_) { out_size.push_back(1); (void)(d); }; // avoid warning about not using d
+  for (auto& d : ro) out_size.push_back(right.size(d));
+
+  std::vector<int64_t> lpermutation(lro);
+  lpermutation.insert(lpermutation.end(), lo.begin(), lo.end());
+  lpermutation.insert(lpermutation.end(), sum_dims_.begin(), sum_dims_.end());
+  lpermutation.insert(lpermutation.end(), ro.begin(), ro.end());
+
+  std::vector<int64_t> rpermutation(lro);
+  rpermutation.insert(rpermutation.end(), sum_dims_.begin(), sum_dims_.end());
+  rpermutation.insert(rpermutation.end(), ro.begin(), ro.end());
+  rpermutation.insert(rpermutation.end(), lo.begin(), lo.end());
+
+  std::vector<int64_t> opermutation(lro.size()+lo.size()+sum_dims_.size()+ro.size(), -1);
+  {
+    int64_t i = 0;
+
+    for (auto it = lro.begin(); it != lro.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = lo.begin(); it != lo.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = sum_dims_.begin(); it != sum_dims_.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = ro.begin(); it != ro.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+  }
+
+  // now we can execute the operations above
+  left = left.permute(lpermutation).reshape({lro_size, lo_size, sum_size});
+  right = right.permute(rpermutation).reshape({lro_size, sum_size, ro_size});
+  Tensor result = at::bmm(left, right);
+  result = result.view(out_size).permute(opermutation);
+
+  // finally squeeze summed dimensions if desired
+  if (! keepdim) {
+    for (int i = dim-1; i>=0; i--)
+      if (sum_dims[i])
+        result.squeeze_(i);
+  }
+  return result;
+}
+
+Tensor einsum(std::string eqn, TensorList tensors) {
+  constexpr size_t number_of_letters = 26;
+  std::string in_eqn;
+  size_t pos;
+  // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis.
+  // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter
+  // mapped to an index and the ellipsis ('...') being mapped to a number of consecutive indices.
+  // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that
+  // the letter has not been assigned an index yet (because it has not been seen).
+  // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices).
+  // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet.
+  // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below.
+
+  std::array<int64_t, number_of_letters> letter_mapping; // map letter to internal (numerical) label
+  letter_mapping.fill(-1);
+  int64_t num_ell_idxes = -1;
+  int64_t first_ell_idx = 0;
+
+  // The internal representation of the left hand side of the equation (with ellipsis expanded) is stored in input_op_idxes.
+  // For each operand, we have a vector mapping each dimension to an internal index.
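+  // For example (hypothetical input, for illustration): for eqn = "bij,bjk->bik",
+  // the letters b, i, j, k are assigned internal indices 0, 1, 2, 3 in order of first
+  // appearance, and input_op_idxes ends up as {{0, 1, 2}, {0, 2, 3}}.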
+ // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and + // of the last occurence of each index. + std::vector> input_op_idxes; // the parsed operand indices + std::array num_letter_occurrences; // number of occurrence in the equation of this letter + num_letter_occurrences.fill(0); + std::vector last_idx_occurrence; // the last operator (left to right) using this index + + if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side + in_eqn = eqn.substr(0, pos); + } else { + in_eqn = eqn; + } + + // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index + int64_t operand = 0; + std::stringstream eqn_stream(in_eqn); + std::string term; + int64_t num_total_idxes = 0; + while (! eqn_stream.eof()) { + std::getline(eqn_stream, term, ','); // term = string with indices of current term + AT_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + + int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' + // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions + int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; + int64_t dims_in_term = 0; // dimensions we have seen + std::vector current_op_idxes; // mapping of operand dimensions to indices for current term + for (auto &c : term) { // c = character with a single letter or '.' + if (c == '.') { + ell_char_count++; + AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + if (ell_char_count == 3) { // this completes the ellipsis + if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size + first_ell_idx = num_total_idxes; + num_ell_idxes = candidate_num_ell_idxes; + num_total_idxes += num_ell_idxes; + } + else { // we have seen an ellipsis before, so we check compatibility + AT_CHECK(candidate_num_ell_idxes == num_ell_idxes, + "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); + } + for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices + current_op_idxes.push_back(first_ell_idx + i); + last_idx_occurrence.push_back(operand); + } + dims_in_term += num_ell_idxes; // keep track of dimensions + } + } else { // a letter (hopefully) + AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis, operand ", operand); + AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; // letter_num = position in letter_mapping + if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping + letter_mapping[letter_num] = num_total_idxes; + num_total_idxes++; + last_idx_occurrence.push_back(operand); + } else { // letter we have already seen + last_idx_occurrence[letter_mapping[letter_num]] = operand; + } + num_letter_occurrences[letter_num]++; + current_op_idxes.push_back(letter_mapping[letter_num]); + dims_in_term++; + } + } + AT_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + input_op_idxes.push_back(std::move(current_op_idxes)); + operand++; + } + // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. + AT_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + + // the following parses or infers output (right hand side) + // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the output indices. -1 means that the index has not been assigned a dimension yet + std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions + int64_t num_output_dims = 0; + if (pos != std::string::npos) { // parse the user provided right hand side + int64_t ell_char_count = 0; + for (auto &c : eqn.substr(pos+2)) { + if (c == '.') { // '.' as part of ellipsis + ell_char_count++; + AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + if (ell_char_count == 3) { // ellipsis complete + AT_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + } else { // letter (hopefully) + AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); + AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; + AT_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); + idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; + num_output_dims++; + } + } + } else { // create an inferred right hand side + // the ellipsis (if in the lhs) comes first + if (num_ell_idxes >= 0) { + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + // then the indices that occur exactly once in alphabetic order + for (size_t idx = 0; idx < number_of_letters; idx++) { + if (num_letter_occurrences[idx] == 1) { + idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; + num_output_dims++; + } + } + } + // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the non-output indices - those that are eventually summed over + int64_t position = num_output_dims; + for (int64_t i = 0; i < num_total_idxes; i++) { + if (idxes_to_preprocessed_dims[i]==-1) { + idxes_to_preprocessed_dims[i] = position; + position++; + } + } + + // we now "homogenize the dimensions", i.e. 
+ // - take diagonals for duplicated indices + // - permute the dimensions to match the order given by idxes_to_preprocessed_dims + // - unsqueeze to create all dimensions for each index in each tensor where they are missing + // we also check that sizes match + // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) + std::vector preprocessed_operands; + std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet + for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { + auto preprocessed_op = tensors[op]; + std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear + std::vector& current_op_input_idxes = input_op_idxes[op]; + int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input + for (size_t i = 0; i < current_op_input_idxes.size(); i++) { + auto idx = current_op_input_idxes[i]; + auto dim_out = idxes_to_preprocessed_dims[idx]; + if (idx_to_dim[dim_out] == -1) { // first appearance + idx_to_dim[dim_out] = dim; + if (size_of_dims[idx] == -1) { // keep track of sizes + size_of_dims[idx] = preprocessed_op.size(dim); + } + else { + AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + } + dim++; + } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] + AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); + // diagonal moves the diagonal dimension to the back + // now we permute the last dim back to idx_to_dim[dim_out] + std::vector perm(preprocessed_op.dim(), 0); + for (int64_t d = 0; d < preprocessed_op.dim(); d++) { + if (d == idx_to_dim[dim_out]) { + perm[d] = preprocessed_op.dim() - 1; + } else { + perm[d] = d - (d > idx_to_dim[dim_out]); + } + } + preprocessed_op = preprocessed_op.permute(perm); + } + } + // now we permute the dimensions in the right order + std::vector permutation; // permutation for this tensor + for (auto &d : idx_to_dim) { + if (d > -1) { + permutation.push_back(d); + } + } + preprocessed_op = preprocessed_op.permute(permutation); + // finally, we insert dimensions for idxes not in the operand + for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { + if (idx_to_dim[dim] == -1) { + preprocessed_op = preprocessed_op.unsqueeze(dim); + } + } + preprocessed_operands.push_back(preprocessed_op); + } + + // now we reduce the indices from left to right + // numpy allows to optimize the path using various + // algorithms (see eigen_path in numpy docs) + // we start with the leftmost operator and reduce indices that + // appear only there + Tensor result = preprocessed_operands[0]; + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == 0) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + result = result.sum(idxes_to_preprocessed_dims[idx], true); + } + } + + // now we process each tensor using sumproduct_pair + for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + std::vector sum_dims; + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == i) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + 
sum_dims.push_back(idxes_to_preprocessed_dims[idx]);
+      }
+    }
+    result = at::native::sumproduct_pair(result, preprocessed_operands[i], sum_dims, true);
+  }
+  // finally, we squeeze out all non-result dimensions
+  for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--)
+    result.squeeze_(dim);
+  return result;
+}
+
+// _trilinear computes a trilinear Einstein sum with an unrolled dimension
+// the result is `(i1.unsqueeze(expand1)*i2.unsqueeze(expand2)*i3.unsqueeze(expand3)).sum(sumdim)`
+// the computation is unrolled in the unroll_dim dimension
+// its main purpose is to unify the computations in bilinear and bilinear_backward
+Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
+                  IntList expand1_, IntList expand2_, IntList expand3_,
+                  IntList sumdim_, int64_t unroll_dim) {
+  int64_t total_dim = i1_.dim()+expand1_.size();
+  AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]");
+  auto expand1 = dim_list_to_bitset(expand1_, total_dim);
+  auto expand2 = dim_list_to_bitset(expand2_, total_dim);
+  auto expand3 = dim_list_to_bitset(expand3_, total_dim);
+  auto sumdim = dim_list_to_bitset(sumdim_, total_dim);
+  Tensor i1 = i1_;
+  Tensor i2 = i2_;
+  Tensor i3 = i3_;
+  std::vector<int64_t> output_size;
+  std::vector<int64_t> sum_dims_12, sum_dims_23;
+  int64_t unroll_size = -1;
+  // asserts...
+  for (int64_t i = 0; i < total_dim; i++) {
+    int64_t s = 0;
+    if (expand1[i]) {
+      i1 = i1.unsqueeze(i);
+    } else {
+      s = i1.size(i);
+    }
+    if (expand2[i]) {
+      i2 = i2.unsqueeze(i);
+    } else {
+      s = i2.size(i);
+    }
+    if (expand3[i]) {
+      i3 = i3.unsqueeze(i);
+      if (sumdim[i] && (i != unroll_dim))
+        sum_dims_12.push_back(i);
+    } else {
+      s = i3.size(i);
+      if (sumdim[i] && (i != unroll_dim))
+        sum_dims_23.push_back(i);
+    }
+    output_size.push_back(sumdim[i] ? 1 : s);
+    if (i == unroll_dim)
+      unroll_size = s;
+  }
+  int64_t slicemul1 = (expand1[unroll_dim] ? 0 : 1);
+  int64_t slicemul2 = (expand2[unroll_dim] ? 0 : 1);
+  int64_t slicemul3 = (expand3[unroll_dim] ? 0 : 1);
+
+  auto output = i1.type().tensor(output_size).zero_();
+  if (!
sumdim[unroll_dim]) { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1), + i2.narrow(unroll_dim, k * slicemul2, 1), + sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k * slicemul3, 1), sum_dims_23, true); + output.narrow(unroll_dim, k, 1).add_(buf); + } + } + else { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k*slicemul1, 1), + i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k*slicemul3, 1), sum_dims_23, true); + output.add_(buf); + } + } + for (int64_t i = output.dim()-1; i >= 0; i--) + if (sumdim[i]) + output.squeeze_(i); + return output; +} + +Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const Tensor& bias) { + AT_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); + for (int64_t i = 0; i < input1.dim() - 1; i++) { + AT_CHECK(input1.size(i) == input2.size(i), + "bilinear(): input batch dimensions do not match at dim ", i, ": got ", input1.size(i), " and ", input2.size(i)); + } + AT_CHECK(input1.size(input1.dim() - 1) == weight.size(1), + "bilinear(): input1 size does not match weight size: got ", + input1.size(input1.dim() - 1), " but expected ", weight.size(1)); + AT_CHECK(input2.size(input2.dim() - 1) == weight.size(2), + "bilinear(): input2 size does not match weight size: got ", + input2.size(input2.dim() - 1), " but expected ", weight.size(2)); + AT_CHECK(!bias.defined() || bias.size(0) == weight.size(0), + "bilinear(): bias size does not match weight size: got ", + bias.size(0), " but expected ", weight.size(0)); + + std::vector output_size; + auto size1 = input1.sizes(); + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + output_size.push_back(weight.size(0)); + auto input1_flattened = input1.view({-1, input1.size(-1)}); + auto input2_flattened = input2.view({-1, input2.size(-1)}); + Tensor output = at::_trilinear(input1_flattened, weight, input2_flattened, {1,3}, {0}, {1,2}, {2,3}).reshape(output_size); + if (bias.defined()) { + output = output + bias; + } + return output; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp new file mode 100644 index 0000000..388d704 --- /dev/null +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -0,0 +1,319 @@ +#include "ATen/ATen.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include +#include +#include + +namespace at { +namespace native { + +// Helper function for det methods. +// For pivoted LU factorization A = P * L * U. Since we always have det(L) = 1, +// det(P) = \pm 1, this method returns a 3-tuple: +// (det(P), diag(U), info), +// where info helps us identify singular matrices. 
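+// Concretely, det(A) = det(P) * det(L) * det(U) = det(P) * prod(diag(U)), so the
+// callers below (det, logdet, slogdet) only need det(P) (which is +/-1) and diag(U);
+// e.g. det() returns diag_U.prod().mul_(det_P) when the factorization succeeds.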
+static inline std::tuple _lu_det_P_diag_U_info(const Tensor& self) { + Tensor p, lu, info; + std::tie(lu, p, info) = self.unsqueeze(0).btrifact_with_info(); + p.squeeze_(0); + lu.squeeze_(0); + int int_info = info.squeeze_().toCInt(); + AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); + auto n = self.size(0); + auto num_exchanges = (at::arange(1, n + 1, p.type()) != p).nonzero().size(0); + if (num_exchanges % 2 == 1) { + return std::make_tuple(-1., lu.diag(), int_info); + } else { + return std::make_tuple(1., lu.diag(), int_info); + } +} + +Tensor det(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "det(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + return at::zeros({}, self.type()); + } else { + return diag_U.prod().mul_(det_P); + } +} + +Tensor logdet(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "logdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U, det; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + det = at::zeros({}, self.type()); + } else { + det = diag_U.prod().mul_(det_P); + } + if (det.sign().toCDouble() <= 0) { + return det.log_(); // in order to get proper -inf (det=0) or nan (det<0) + } else { + return diag_U.abs().log().sum(); + } +} + +std::tuple slogdet(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "slogdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U, det; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + det = at::zeros({}, self.type()); + } else { + det = diag_U.prod().mul_(det_P); + } + return std::make_tuple(det.sign(), diag_U.abs_().log_().sum()); +} + +Tensor inverse(const Tensor& self) { + Tensor result = self.type().tensor(); + return at::native::inverse_out(result, self); +} + +Tensor& inverse_out(Tensor &result, const Tensor &self) { + AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + "tensor should have CPU or CUDA backend"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "tensor should be of floating-point type"); + if (self.size(0) == 0) { + return result.resize_({0, 0}); + } else { + return at::_getri_out(result, self); + } +} + +Tensor pinverse(const Tensor& self, double rcond) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && self.dim() == 2, + "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " + "of floating types"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + if (self.numel() == 0) { + // Match NumPy + return self.type().tensor({self.size(1), self.size(0)}); + } + Tensor U, S, V; + std::tie(U, S, V) = self.svd(); + double max_val = S[0].toCDouble(); + Tensor S_pseudoinv = at::where(S > rcond * max_val, S.reciprocal(), at::zeros({}, self.options())); + return V.mm(S_pseudoinv.diag().mm(U.t())); +} + +static void check_1d(const 
Tensor& t, const char* arg, const char* fn) { + AT_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); +} + +Tensor ger(const Tensor& self, const Tensor& vec2) { + check_1d(self, "self", "ger"); + check_1d(vec2, "vec2", "ger"); + return at::_ger(self, vec2); +} + +Tensor& ger_out(Tensor& result, const Tensor& self, const Tensor& vec2) { + check_1d(self, "self", "ger"); + check_1d(vec2, "vec2", "ger"); + return at::_ger_out(result, self, vec2); +} + +Tensor mm(const Tensor& self, const Tensor& mat2) { + if (self.is_sparse()) { + return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); + } + return self.type()._mm(self, mat2); +} + +Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { + if (self.is_sparse()) { + return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + } + return self.type()._mm_out(result, self, mat2); +} + +Tensor mv(const Tensor& self, const Tensor& vec) { + check_1d(vec, "vec", "mv"); + return at::_mv(self, vec); +} + +Tensor& mv_out(Tensor& result, const Tensor& self, const Tensor& vec) { + check_1d(vec, "vec", "mv"); + return at::_mv_out(result, self, vec); +} + +Tensor addmv(const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return at::_addmv(self, mat, vec, beta, alpha); +} + +Tensor& addmv_(Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return self._addmv_(mat, vec, beta, alpha); +} + +Tensor& addmv_out(Tensor &result, const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return at::_addmv_out(result, self, mat, vec, beta, alpha); +} + +Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return at::_addr(self, vec1, vec2, beta, alpha); +} + +Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return self._addr_(vec1, vec2, beta, alpha); +} + +Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return at::_addr_out(result, self, vec1, vec2, beta, alpha); +} + +Tensor dot(const Tensor& self, const Tensor& tensor) { + check_1d(self, "self", "dot"); + check_1d(tensor, "tensor", "dot"); + return self._dot(tensor); +} + +Tensor& dot_out(Tensor& result, const Tensor& self, const Tensor& tensor) { + result.resize_({}); + // dispatching through type ensures we don't allow mismatched types. + return self.type().fill_(result, self.dot(tensor)); +} + +/* +Matrix product of two Tensors. +The behavior depends on the dimensionality of the Tensors as follows: +- If both Tensors are 1-dimensional, the dot product (scalar) is returned. +- If both arguments are 2-dimensional, the matrix-matrix product is returned. +- If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. +- If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. 
+- If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are broadcasted (and thus + must be broadcastable). For example, if tensor1 is a (j x 1 x n x m) Tensor + and tensor2 is a (k x m x p) Tensor, the returned tensor will be an (j x k x n x p) Tensor. +*/ +Tensor matmul(at::optional out_opt, const Tensor& tensor1, const Tensor& tensor2) { + auto dim_tensor1 = tensor1.dim(); + auto dim_tensor2 = tensor2.dim(); + auto has_out = out_opt.has_value(); + Tensor out = out_opt.value_or(Tensor()); + + if (dim_tensor1 == 1 && dim_tensor2 == 1) { + return has_out ? at::native::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); + } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { + return has_out ? at::native::mv_out(out, tensor1, tensor2) : tensor1.mv(tensor2); + } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { + return has_out ? at::native::mm_out(out, tensor1.unsqueeze(0), tensor2).squeeze_(0) + : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); + } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { + return has_out ? at::native::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); + } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { + // optimization: use mm instead of bmm by folding tensor1's batch into + // its leading matrix dimension. + + Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; + auto size1 = tensor1.sizes(); + auto size2 = t2.sizes(); + std::vector output_size; + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + if (dim_tensor2 > 1) { + output_size.push_back(size2[dim_tensor2 - 1]); + } + + // fold the batch into the first dimension + Tensor t1 = tensor1.contiguous().view({-1, size1[size1.size() - 1]}); + Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) + : at::_unsafe_view(t1.mm(t2), output_size); + return has_out ? out.set_(output) : output; + } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { + // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); + // we track m1 vs m2 separately even though they must match for nicer error messages + int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1; + int64_t m1 = tensor1.size(-1); + IntList batch_tensor1(tensor1.sizes().data(), std::max(dim_tensor1 - 2, 0)); + int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1; + int64_t p = tensor2.size(-1); + IntList batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); + + // expand the batch portion (i.e. 
cut off matrix dimensions and expand rest) + std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); + + std::vector tensor1_expand_size(expand_batch_portion); + tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); + + std::vector tensor2_expand_size(expand_batch_portion); + tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); + + int expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(), + 1, std::multiplies()); + + std::vector tensor1_bmm_view({expand_batch_product}); + tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); + + std::vector tensor2_bmm_view({expand_batch_product}); + tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p}); + + // flatten expanded batches + Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).contiguous().view(tensor1_bmm_view); + Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).contiguous().view(tensor2_bmm_view); + + // reshape batches back into result + std::vector output_shape(expand_batch_portion); + if (dim_tensor1 > 1) { + output_shape.push_back(n); + } + if (dim_tensor2 > 1) { + output_shape.push_back(p); + } + + Tensor output = has_out ? at::_unsafe_view(at::bmm_out(out, tensor1_expanded, tensor2_expanded), output_shape) + : at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); + + return has_out ? out.set_(output) : output; + } + + AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", + dim_tensor1, "D and ", dim_tensor2, "D"); + +} + +Tensor matmul(const Tensor & tensor1, const Tensor & tensor2) { + return at::native::matmul(at::nullopt, tensor1, tensor2); +} + +Tensor& matmul_out(Tensor &result, const Tensor & tensor1, const Tensor & tensor2) { + at::native::matmul(at::optional(result), tensor1, tensor2); + return result; +} + +} +} diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h new file mode 100644 index 0000000..d7b9a6d --- /dev/null +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -0,0 +1,42 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +/* + * Clones a Tensor so that the following conditions hold: + * If we think of a Tensor of having size (B, M, N), where B is any number + * of batch dimensions, then: + * - Each (M, N) matrix is in column major form + * - Let Tensor P have size (B, M, N) and Q have size (B, M', N'). + * Then when laid out in memory, the M by N matrix starting at + * P.data_ptr()[b * M * N] is of the same corresponding batch as the M' by N' + * matrix starting at Q.data_ptr()[b * M' * N']. + */ +static inline Tensor cloneBatchedColumnMajor(const Tensor& src) { + // If src is already in batched column major format, then + // this will be efficient (no reordering of the data will occur) + // because the first transpose will make the tensor contiguous, + // and cloning a contiguous tensor is fast. + auto result = src.transpose(-2, -1).clone(); + result.transpose_(-2, -1); + return result; +} + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches. 
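+ * For example (hypothetical shape): a tensor of size (3, 5, M, N) holds 3 * 5 = 15
+ * matrices of size M x N, so batchCount returns 15 and matrixStride (below)
+ * returns M * N.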
+ */ +static inline int64_t batchCount(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + result *= batched_matrices.size(i); + } + return result; +} + +// Computes the number of elements of a matrix in a batched matrix tensor +static inline int64_t matrixStride(const Tensor& batched_matrices) { + return batched_matrices.size(-1) * batched_matrices.size(-2); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp new file mode 100644 index 0000000..c370cb4 --- /dev/null +++ b/aten/src/ATen/native/Loss.cpp @@ -0,0 +1,74 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +#define EPSILON 1e-12 + + +namespace at { namespace native { + +Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { + auto prod_sum = (input1 * input2).sum(1); + auto mag_square1 = (input1 * input1).sum(1) + EPSILON; + auto mag_square2 = (input2 * input2).sum(1) + EPSILON; + auto denom = (mag_square1 * mag_square2).sqrt_(); + auto cos = prod_sum / denom; + + auto zeros = at::zeros_like(target); + auto pos = 1 - cos; + auto neg = (cos - margin).clamp_min_(0); + auto output_pos = at::where(target == 1, pos, zeros); + auto output_neg = at::where(target == -1, neg, zeros); + auto output = output_pos + output_neg; + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / target.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor hinge_embedding_loss(const Tensor& self, const Tensor& target, double margin, int64_t reduction) { + auto zeros = at::zeros_like(self); + auto margin_clamp = (margin - self).clamp_min_(0); + auto output_margin = at::where(target != 1, margin_clamp, zeros); + auto output_self = at::where(target != -1, self, zeros); + auto output = output_margin + output_self; + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / self.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor triplet_margin_loss(const Tensor& anchor, const Tensor& positive, const Tensor& negative, double margin, + double p, double eps, bool swap, int64_t reduction) { + auto dist_pos = at::pairwise_distance(anchor, positive, p, eps); + auto dist_neg = at::pairwise_distance(anchor, negative, p, eps); + if (swap) { + auto dist_swap = at::pairwise_distance(positive, negative, p, eps); + dist_neg = at::min(dist_neg, dist_swap); + } + auto output = at::clamp_min(margin + dist_pos - dist_neg, 0); + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / output.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor margin_ranking_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { + auto output = (-target * (input1 - input2) + margin).clamp_min_(0); + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / output.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp new file mode 100644 index 0000000..dfb7e62 --- /dev/null +++ b/aten/src/ATen/native/Memory.cpp @@ -0,0 +1,20 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include 
"ATen/detail/CUDAHooksInterface.h" + +namespace at { +namespace native { + +Tensor pin_memory(const Tensor& self) { + if (self.type().backend() != kCPU) { + AT_ERROR("cannot pin '", self.type().toString(), "' only CPU memory can be pinned"); + } + auto* allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); + auto tensor = self.type().tensorWithAllocator(self.sizes(), self.strides(), allocator); + tensor.copy_(self); + return tensor; +} + +} +} diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp new file mode 100644 index 0000000..ded0082 --- /dev/null +++ b/aten/src/ATen/native/Normalization.cpp @@ -0,0 +1,193 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/Config.h" + +#include "ATen/detail/CUDAHooksInterface.h" + +#include + +namespace at { namespace native { + +namespace { + void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ + if (actual != expected){ + std::stringstream ss; + ss << arg_name << " should contain " << expected << " elements not " << actual ; + throw std::runtime_error(ss.str()); + } + } +} + +Tensor batch_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool training, double momentum, double eps, bool cudnn_enabled) { + + auto num_features = input.sizes()[1]; + if (running_mean.defined()) { + check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); + } else if (!training) { + throw std::runtime_error("running_mean must be defined in evaluation mode"); + } + if (running_var.defined()) { + check_dims_match_num_input_features("running_var", num_features, running_var.numel()); + } else if (!training) { + throw std::runtime_error("running_var must be defined in evaluation mode"); + } + if (weight.defined()) { + check_dims_match_num_input_features("weight", num_features, weight.numel()); + } + if (bias.defined()) { + check_dims_match_num_input_features("bias", num_features, bias.numel()); + } + + bool use_cudnn = false; + use_cudnn = (input.type().is_cuda() + && (input.type().scalarType() != at::kHalf + || weight.type().scalarType() == at::kFloat) + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && input.size(0) <= 131070 + && detail::getCUDAHooks().compiledWithCuDNN() + && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L); + + if (use_cudnn && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN()) { + return std::get<0>(at::cudnn_batch_norm( + input.contiguous(), weight.contiguous(), + bias.contiguous(), + running_mean.defined() ? running_mean.contiguous() : running_mean, + running_var.defined() ? 
running_var.contiguous() : running_var, + training, momentum, eps)); + } + + return at::thnn_batch_norm( + input.contiguous(), weight, bias, + running_mean, running_var, training, momentum, eps); +} + +Tensor layer_norm(const Tensor& input, IntList normalized_shape, + const Tensor& weight /* optional */, const Tensor& bias /* optional */, + double eps, bool cudnn_enabled) { + + int64_t normalized_ndim = normalized_shape.size(); + + if (normalized_ndim < 1) { + std::stringstream ss; + ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " + << "containing at least one element, but got normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + if (weight.defined() && !weight.sizes().equals(normalized_shape)) { + std::stringstream ss; + ss << "Expected weight to be of same shape as normalized_shape, but got " + << "weight of shape " << weight.sizes() << " and normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + if (bias.defined() && !bias.sizes().equals(normalized_shape)) { + std::stringstream ss; + ss << "Expected bias to be of same shape as normalized_shape, but got " + << "bias of shape " << bias.sizes() << " and normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + auto input_shape = input.sizes(); + auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + throw std::runtime_error(ss.str()); + } + + int64_t n = 1; + for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) { + n *= input_shape[i]; + } + + // Apply layer norm + auto input_reshaped = input.contiguous().view({1, n, -1}); + + auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, + cudnn_enabled); + out = out.view(input_shape); + + if (weight.defined() && bias.defined()) { + return bias.addcmul(out, weight, 1); + } else if (weight.defined()) { + return out.mul(weight); + } else if (bias.defined()) { + return out.add(bias); + } else { + return out; + } +} + +Tensor group_norm(const Tensor& input, int64_t num_groups, + const Tensor& weight /* optional */, const Tensor& bias /* optional */, + double eps, bool cudnn_enabled) { + + auto input_shape = input.sizes(); + int64_t b = input.size(0); + int64_t c = input.size(1); + + if (c % num_groups != 0) { + std::stringstream ss; + ss << "Expected number of channels in input to be divisible by " + << "num_groups, but got input of shape " << input.sizes() << " and " + << "num_groups=" << num_groups; + throw std::runtime_error(ss.str()); + } + + if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { + std::stringstream ss; + ss << "Expected weight to be a vector of size equal to the number of " + << "channels in input, but got weight of shape " << weight.sizes() + << " and input of shape " << input.sizes(); + throw std::runtime_error(ss.str()); + } + + if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { + std::stringstream ss; + ss << "Expected bias to be a vector of size equal to the number of " + << "channels in input, but got bias of shape " << weight.sizes() + << " and input of shape " << input.sizes(); + throw std::runtime_error(ss.str()); + } + + // Apply group norm + auto input_reshaped = 
input.contiguous().view({1, b * num_groups, -1}); + + auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, + cudnn_enabled); + out = out.view(input_shape); + + if (!weight.defined() && !bias.defined()) { + return out; + } + + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = c; + + if (weight.defined() && bias.defined()) { + return bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + return out.mul(weight.view(affine_param_shape)); + } else { + return out.add(bias.view(affine_param_shape)); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp new file mode 100644 index 0000000..a13cae0 --- /dev/null +++ b/aten/src/ATen/native/Pooling.cpp @@ -0,0 +1,134 @@ +#include "ATen/ATen.h" + +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include "ATen/TensorUtils.h" + +#include + +namespace at { namespace native { + +static void check1d( + const char* function_name, + const char* argument_name, + IntList x) { + AT_CHECK( + x.size() == 1, + function_name, "() argument '", argument_name, + "' should contain one int (got ", x.size(), ")"); +} + +Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) { + checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); + check1d("adaptive_avg_pool1d", "output_size", output_size); + + auto output = at::adaptive_avg_pool2d( + self.unsqueeze(2), + {1, output_size[0]}); + + return output.squeeze(2); +} + +std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) { + checkDim("adaptive_max_pool1d", TensorArg(self, "self", 1), 3); + check1d("adaptive_max_pool1d", "output_size", output_size); + + Tensor output, indices; + std::tie(output, indices) = at::adaptive_max_pool2d( + self.unsqueeze(2), + {1, output_size[0]}); + + return std::make_tuple(output.squeeze(2), indices.squeeze(2)); +} + +std::tuple max_pool1d_with_indices( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + if (stride.empty()) { + stride = kernel_size; + } + checkDim("max_pool1d", TensorArg(self, "self", 1), 3); + check1d("max_pool1d", "kernel_size", kernel_size); + check1d("max_pool1d", "stride", stride); + check1d("max_pool1d", "padding", padding); + check1d("max_pool1d", "dilation", dilation); + + Tensor output, indices; + std::tie(output, indices) = at::max_pool2d_with_indices( + self.unsqueeze(2), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + {1, dilation[0]}, + ceil_mode); + + return std::make_tuple(output.squeeze(2), indices.squeeze(2)); +} + +Tensor avg_pool1d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + bool ceil_mode, + bool count_include_pad) { + if (stride.empty()) { + stride = kernel_size; + } + checkDim("avg_pool1d", TensorArg(self, "self", 1), 3); + check1d("avg_pool1d", "kernel_size", kernel_size); + check1d("avg_pool1d", "stride", stride); + check1d("avg_pool1d", "padding", padding); + + auto output = at::avg_pool2d( + self.unsqueeze(2), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + ceil_mode, + count_include_pad); + + return output.squeeze(2); +} + +Tensor max_pool1d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool1d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return 
std::get<0>(output_and_indices); +} + +Tensor max_pool2d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool2d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return std::get<0>(output_and_indices); +} + +Tensor max_pool3d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool3d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return std::get<0>(output_and_indices); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md new file mode 100644 index 0000000..d4ad799 --- /dev/null +++ b/aten/src/ATen/native/README.md @@ -0,0 +1,310 @@ +ATen "native" functions are the modern mechanism for adding operators and +functions to ATen (they are "native" in contrast to legacy functions, which are bound +via TH/THC cwrap metadata). Native functions +are declared in `native_functions.yaml` and have implementations defined +in one of the `cpp` files in this directory. + +Like all ATen methods/functions, native functions are made available +from both ATen's C++ and Python APIs. In C++, they are made available +either as methods on `Tensor` (`t.mymeth()`) and functions in the ATen +namespace (`at::myfunc()`). In PyTorch, they are made available as +methods on `Variable` or as functions on `torch._C._FunctionBase` +(it is the user's responsibility to re-exporting these functions in +a more user-facing module.) At the moment, only +functions which ingest `Variable` are made available; to use a function +with non-differentiable tensors, wrap your tensors with `Variable` before +passing them in. + +The rest of this document describes how to implement an ATen function. + +## Registering a function in `native_functions.yaml` + +Every native function must have an entry in +`native_functions.yaml`. The format can be summarized as: + +``` +- func: func_name(ArgType arg0[=default], ArgType arg1[=default], ...) -> ReturnType + variants: function, method + dispatch: + CPU: func_cpu + CUDA: func_cuda +``` + +Each component is described in more detail below: + +### `func` + +``` +- func: func_name(ArgType arg0[=default], ArgType arg1[=default], ...) -> ReturnType +``` + +The `func` entry is a string describing the name of the function and its type +signature. + +**Argument types.** These types are permissible as ArgType: + +- `Tensor`. A `Tensor` argument translates into a C++ argument of type `const Tensor&` + (except when the argument is "inplace"; in this case, it is simply `Tensor&`). + A trailing `?`, as in `Tensor?`, indicates that the tensor argument is optional + and may be omitted by passing an undefined tensor. When a function takes multiple + `Tensor` arguments, these tensors are assumed to be the same type (e.g., + if one argument is a `FloatTensor`, all other arguments are checked + to be `FloatTensor`s.) +- Tensors of specific types. At the moment, valid type names are: + - `IntegerTensor` (a.k.a. `LongTensor`) + - `BoolTensor` (a.k.a. `ByteTensor`) + - `IndexTensor` (a.k.a. `IntTensor`) + These type names were inherited from TH, and may be renamed soon, so + don't commit them to memory. +- `TensorList`. A `TensorList` argument translates into a C++ argument of type `ArrayRef` + (a.k.a. `TensorList`) +- `IntList`. 
+  `IntList` accepts an optional length specifier, e.g., `IntList[2]`, which
+  has no effect in C++ but extends our Python bindings to accept a bare number, which will be
+  expanded into an appropriately sized list by repeating the number.
+- `int64_t`. There is no `int`; ATen policy is to use `int64_t` in the API anywhere you would
+  have ordinarily passed an `int` or `size_t`.
+- `double`. There is no `float`; ATen policy is to use `double` anywhere you would have used `float`.
+- `bool`
+- `Scalar`. `Scalar` supports binding to any numerical type from Python, including integral types,
+  floating point types, and zero dimensional tensors. `int64_t` and `double` can only bind to the
+  corresponding Python numerical types. However, you probably don't want to use `Scalar`; it is
+  really used for binding to "real" types in TH/THC code, where the Python APIs you are binding to are
+  actually different types. `double` and `int64_t` argument types should suffice for most algorithms.
+- `Generator*`, the state for a random number generator.
+- `std::array<bool,N>` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise
+  this argument will not parse correctly. (If you decide to fix this, make sure you fix the
+  argument parser both in ATen and in PyTorch.)
+- `*` is a special sentinel argument, which doesn't translate into an actual
+  argument, but indicates that in the Python bindings, any subsequent arguments
+  must be specified as keyword arguments (and cannot be provided positionally).
+
+**Return types.** These types are permissible as ReturnType:
+
+- `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector<Tensor>`,
+  respectively (unless the operation is in-place, in which case the return type
+  is `Tensor&`).
+- A tuple of any number of `Tensor`s, e.g., `(Tensor, Tensor)`, translating into
+  the corresponding C++ `std::tuple`, e.g., `std::tuple<Tensor, Tensor>`.
+
+If you need a type that is not listed here, it may be possible to extend ATen's
+code generation to support it. ATen's philosophy is to support
+only simple, universal types, plus a handful of fundamental Tensor structures
+(e.g., `Tensor` and `Generator*`), because these types can be easily ported to any language
+bound to ATen (in practice, C++ and Python).
+
+**Argument names.** Argument names are meaningful; downstream binding code may make use of the specific
+argument name you provide, and renaming an argument is considered a BC-breaking
+change (e.g., you will probably need to update `tools/autograd/derivatives.yaml` at
+least). In `native_functions.yaml`, if your function (usually a function with an `out` affix) takes
+the result Tensor among its arguments, you must name that argument `Tensor result`. If there is more
+than one result Tensor, name them `Tensor result0, Tensor result1, ...`.
+
+TODO: Do argument names affect Python keyword arguments?
+
+**Defaults.** Any suffix of arguments can have a default value defined;
+these default values translate into C++/Python default values which
+are applied when those positional arguments are not specified.
+
+Here are the supported default values:
+
+* Numbers (e.g., `0` or `5.0`) for `int64_t`, `double` and `IntList`
+  with an explicit length (e.g., `IntList[2]`); in the case of `IntList`,
+  a number is replicated to fill the length (e.g., `IntList[2] x=2`
+  is equivalent to `IntList[2] x={2,2}`).
+* Lists of numbers (e.g., `{0, 0}`) for `IntList`.
+* Booleans (e.g., `true`) for `bool`.
+* Empty initializer lists (e.g., `{}`) for `Tensor` (this implicitly changes + a `Tensor` argument to accept undefined tensors). +* `nullptr` for pointer types (e.g., `Generator*`) + +The declarations also support the following attributes: + +### `variants` + +``` +variants: function, method +``` + +Controls whether Tensor method (`t.foo()`) or namespace Function (`at::foo()`) is +generated as a result of this declaration. If the declaration is a method, +you must have an argument `Tensor self` at some position in the method; +in the method variant this argument will be elided from the argument +list. For example, given the declaration `where(BoolTensor cond, Tensor self, Tensor other)`, +this generates the function `at::where(cond, self, other)` and the method +`self.where(cond, other)`. + +By default, ATen generates both function and method variants for a native function. +Generally, the function variant is always useful; however, you may not wish +to generate a method variant. Tensor operations as methods are appropriate for "core" +Tensor operations (e.g., add, sub, etc.), but not for more complicated neural network +layers (e.g., `conv2d`) and internal functions designed specifically for binding +(e.g., `cudnn_convolution`). + +### `dispatch` + +``` +dispatch: + CPU: func_cpu + CUDA: func_cuda +``` + +This specifies the actual name of the function you want to dispatch to, so you +can dispatch to different functions depending on whether or not you have CPU or +CUDA tensors. Technically, it is also possible to write `dispatch: func_name` +to unconditionally dispatch to a native function whose name is different than +the name in the public ATen API, but this is generally frowned upon (just name +them the same thing!) + +### `python_default_init` + +``` +python_default_init: + argument_name: initializing_expression +``` + +A map from argument names to default initializing expressions written in C++. Such default +expressions will only be used in Python API (in the C++ API, these arguments are +mandatory). + +There are a few situations where you might like to use this functionality: + +- You want a default value which is fine in Python but would cause ambiguity in C++. + For example, `norm(Tensor self, real p=2, int64_t dim=1)` would cause ambiguity + with long tensors in C++. Therefore, we need to make `p=2` a python only default + initialization value. + +- You want a value to default to the same value as another argument (this cannot + be expressed in C++ default arguments). + +If you grep for `python_default_init`, you can find examples of this being used; +in general, most functions will not need to use this. + +## Writing an implementation in C++ + +Implementations of native functions go in an appropriate C++ file in the +`native/` directory (they are organized roughly by topic, but there is no +semantic meaning to their organization aside for the `cuda` directory, +which is the only place the build system knows how to build `cu` files.) +To write a native function, you only need to write a C++ +implementation (no header necessary) with a matching signature to +the generated header from the ATen metadata. There are many +simple native functions; take a look at some of them to see what to do. + +Although, for the most part, writing an ATen function is mostly writing +the algorithm you want to implement, there are some less obvious details +you should also consider. + +### Will your function be automatically differentiable? 
+ +If you are writing a pair of functions `foo` and `foo_backward`, with +the intent that `foo_backward` implements the derivative of `foo`, then +your implementation of `foo` is probably not automatically differentiable: +it might make use of functions like `data_ptr()` or it dispatches differently +depending on if it's operating on CPU or CUDA tensors. Once you write these two functions, +you will have to write an entry correlating them together in +`tools/autograd/derivatives.yaml`. + +However, in some situations, you can write a function in ATen and it +will be automatically differentiated! This can be the case if the function implementation +only calls other operations which are themselves differentiable. In this +case, you don't have to write an entry in `tools/autograd/derivatives.yaml`. + +### Can it handle being passed Variables? + +The biggest subtlety of writing an ATen implementation is the fact that +`Tensor` is not a "final" class: your implementation may be passed objects +which inherit from `Tensor` (in particular, the `Variable` subclass +implements automatic differentiation in PyTorch.) This has some +direct consequences on valid implementations: + +* Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a + caller will be expecting to get `Variable`s out if it passes `Variable`. + Instead, create tensors from the `type()` of one of the input tensors, e.g., + `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + a different scalar type. + +* If you need to call other ATen functions, be sure to qualify the call + with `at::`; don't call them unqualified (in the `at::native` namespace). + Using the qualified name ensures that your invocation gets dispatched to + the `Variable` (which may be overridden to behave differently than + simply dispatch to `at::native`). + +These are not hard and fast rules: in particular, if you explicitly define +a derivative for a function, it will only ever be called with `Tensor` +arguments. However, it is considered good style to abide by these rules, +since code written in this style is more robust. + +NB: There is one downside to following the `at::` qualification rule, which +is that if you know that you will only ever be called with `Tensor`, a +direct `at::native` call will be more efficient (as it avoids a dynamic +dispatch). + +### How to handle broadcasting? + +Unlike our legacy TH bindings, ATen native functions do not automatically +handle broadcasting; you will have to insert the necessary broadcasting +calls yourself. + +When writing broadcasting code, we obey the convention that `op` is +broadcasting, while `s_op` (with the `s_` prefix) is not broadcasting. 
The +relationship is best seen by an example of how you would implement broadcasting +addition out of non-broadcasting addition: + +``` +#include + +Tensor add(const Tensor& self, const Tensor& other) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add"); + return s_add(b_self, b_other); +} + +Tensor s_add(const Tensor& self, const Tensor& other) { + // non-broadcasting implementation of addition +} +``` + +For inplace operations, the convention looks like this: + +``` +Tensor& add_(Tensor& self, const Tensor& other) { + Tensor b_other = expand_inplace(self, other, "add_"); + return s_add_(self, b_other); +} + +Tensor& s_add_(Tensor& self, const Tensor& other) { + // non-broadcasting implementation of inplace addition +} +``` + +### Undefined tensor conventions + +By default, `Tensor` arguments to ATen functions are always defined, unless +you explicitly specified that an undefined tensor was permissible by writing +`Tensor?` or `Tensor x={}`. + +The rules for returning undefined Tensors are a bit more subtle, but there +is only one case you have to remember: + +* If the function in question is a backward function which accepts a + `std::array output_mask` argument, you MUST return an undefined + `Tensor` at every tuple position `i` for which `output_mask[i]` is false, otherwise + +* You MUST NOT return an undefined tensor. + +The most common situations where you might be tempted to return undefined tensors +are when: + +- You have a forward function that may return a buffer if training is enabled, but does not + return the buffer in inference mode. In this case, just return an appropriately + typed zero-size tensor. + +- You have a backward function where the gradient for an input is zero. In this case, you + are expected to create a zero-filled tensor of appropriate size to return for this input. + To get the shape, it may be helpful to take a `TensorGeometry` of the input to use. + +### Debugging tips + +If you build ATen and get a linker error, that probably means you copy-pasted +the C++ definition of your function incorrectly. Double check your `Tensor` +arguments, and make sure you wrote `const Tensor&` in your signature. diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp new file mode 100644 index 0000000..affa9d2 --- /dev/null +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -0,0 +1,685 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/WrapDimUtilsMulti.h" +#include "ReduceOpsUtils.h" +#include "cpu/ReduceOpsKernel.h" + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +static inline Tensor integer_upcast(const Tensor& self, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); + return self.toType(upcast_scalarType); +} + +static inline Tensor cumsum(const Tensor& self, int64_t dim, optional dtype) { + return at::_cumsum(integer_upcast(self, dtype), dim); +} + +Tensor cumsum(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumsum(self, dim, optional(dtype)); +} + +Tensor cumsum(const Tensor& self, int64_t dim) { + return at::native::cumsum(self, dim, nullopt); +} + +static inline Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in cumsum. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_cumsum_out(result, self.toType(result.type().scalarType()), dim); +} + +Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumsum_out(result, self, dim, optional(dtype)); +} + +Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim) { + return at::native::cumsum_out(result, self, dim, nullopt); +} + +static inline Tensor cumprod(const Tensor& self, int64_t dim, optional dtype) { + return at::_cumprod(integer_upcast(self, dtype), dim); +} + +Tensor cumprod(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumprod(self, dim, optional(dtype)); +} + +Tensor cumprod(const Tensor& self, int64_t dim) { + return at::native::cumprod(self, dim, nullopt); +} + +static inline Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in cumprod. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_cumprod_out(result, self.toType(result.type().scalarType()), dim); +} + +Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumprod_out(result, self, dim, optional(dtype)); +} + +Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim) { + return at::native::cumprod_out(result, self, dim, nullopt); +} + +// ALL REDUCE ################################################################# + +static inline Tensor mean(const Tensor &self, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. 
Got ", + at::toString(scalarType), + " instead."); + if (self.numel() > 0) { + Tensor result = at::native::sum(self); + return result.div_(self.numel()); + } else { + return self.type().scalarTensor(std::numeric_limits::quiet_NaN()); + } +} + +Tensor mean(const Tensor &self, ScalarType dtype) { + return at::native::mean(self, optional(dtype)); +} + +Tensor mean(const Tensor &self) { + return at::native::mean(self, nullopt); +} + +static inline Tensor sum(const Tensor &self, optional dtype) { + return at::_sum(integer_upcast(self, dtype)); +} + +Tensor sum(const Tensor &self, ScalarType dtype) { + return at::native::sum(self, optional(dtype)); +} + +Tensor sum(const Tensor &self) { + return at::native::sum(self, nullopt); +} + +Tensor _sum_cpu(const Tensor& self) { + if (self.is_contiguous()) { + Tensor result = at::empty({}, self.type()); + sum_kernel(result, self, at::nullopt); + return result; + } + return self._sumall(); +} + +static inline Tensor prod(const Tensor &self, optional dtype) { + return at::_prod(integer_upcast(self, dtype)); +} + +Tensor prod(const Tensor &self, ScalarType dtype) { + return at::native::prod(self, optional(dtype)); +} + +Tensor prod(const Tensor &self) { + return at::native::prod(self, nullopt); +} + +Tensor _prod_cpu(const Tensor &self) { + if (self.is_contiguous()) { + Tensor result = at::empty({}, self.type()); + prod_kernel(result, self, at::nullopt); + return result; + } + return self._prodall(); +} + +// \ALL REDUCE ################################################################ + +// DIM REDUCE ################################################################# + +static inline Tensor &mean_out(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim, optional dtype) { + ScalarType scalarType = result.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. Got ", + at::toString(scalarType), + " instead."); + at::native::sum_out( + result, self.toType(result.type().scalarType()), dim, keepdim); + if (result.numel() > 0 && self.ndimension() > 0) { + int64_t numel = self.size(dim); + if (numel > 0) { + result.div_(numel); + } else { + // NumPy equivalent + result.fill_(std::numeric_limits::quiet_NaN()); + } + } + return result; +} + +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::mean_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) { + return at::native::mean_out(result, self, dim, keepdim, nullopt); +} + +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::mean_out(result, self, dim, false, dtype); +} + +static inline Tensor &sum_out(Tensor &result, const Tensor &self, IntList dim, + bool keepdim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in sum. 
Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_sum_out(result, self.toType(result.type().scalarType()), dim, keepdim); +} + +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, bool keepdim, ScalarType dtype) { + return at::native::sum_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, bool keepdim) { + return at::native::sum_out(result, self, dim, keepdim, nullopt); +} + +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, ScalarType dtype) { + return at::native::sum_out(result, self, dim, false, dtype); +} + +Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + sum_kernel(result, self, dim); + if (!keepdim) result.squeeze_(dim); + return result; + } + return at::_th_sum_out(result, self, dim, keepdim); +} + +static inline Tensor &prod_out(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in prod. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_prod_out(result, self.toType(result.type().scalarType()), dim, keepdim); +} + +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::prod_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) { + return at::native::prod_out(result, self, dim, keepdim, nullopt); +} + +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::prod_out(result, self, dim, false, dtype); +} + +Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + prod_kernel(result, self, dim); + if (!keepdim) result.squeeze_(dim); + return result; + } + return at::_th_prod_out(result, self, dim, keepdim); +} + +static inline Tensor mean(const Tensor &self, int64_t dim, bool keepdim, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. 
Got ", + at::toString(scalarType), + " instead."); + Tensor result = at::native::sum(self, dim, keepdim); + if (result.numel() > 0 && self.ndimension() > 0) { + int64_t numel = self.size(dim); + if (numel > 0) { + result.div_(numel); + } else { + // NumPy equivalent + result.fill_(std::numeric_limits::quiet_NaN()); + } + } + return result; +} + +Tensor mean(const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::mean(self, dim, keepdim, at::optional(dtype)); +} + +Tensor mean(const Tensor& self, int64_t dim, bool keepdim) { + return at::native::mean(self, dim, keepdim, nullopt); +} + +Tensor mean(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::mean(self, dim, false, dtype); +} + +static inline Tensor sum(const Tensor &self, IntList dim_, bool keepdim, optional dtype) { + return at::_sum(integer_upcast(self, dtype), dim_, keepdim); +} + +Tensor sum(const Tensor& self, IntList dim, bool keepdim, ScalarType dtype) { + return at::native::sum(self, dim, keepdim, at::optional(dtype)); +} + +Tensor sum(const Tensor& self, IntList dim, bool keepdim) { + return at::native::sum(self, dim, keepdim, nullopt); +} + +Tensor sum(const Tensor& self, IntList dim, ScalarType dtype) { + return at::native::sum(self, dim, false, dtype); +} + +Tensor _sum(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::_sum_out(result, self, dim, keepdim); +} + +static inline Tensor prod(const Tensor &self, int64_t dim_, bool keepdim, optional dtype) { + return at::_prod(integer_upcast(self, dtype), dim_, keepdim); +} + +Tensor prod(const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::prod(self, dim, keepdim, at::optional(dtype)); +} + +Tensor prod(const Tensor& self, int64_t dim, bool keepdim) { + return at::native::prod(self, dim, keepdim, nullopt); +} + +Tensor prod(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::prod(self, dim, false, dtype); +} + +Tensor _prod(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::_prod_out(result, self, dim, keepdim); +} + +Tensor& logsumexp_out(Tensor& result, const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + // can't take max of empty tensor. + if (self.numel() != 0) { + auto maxes = at::max_values(self, dim, true); + result = at::where((maxes == INFINITY).__or__(maxes == -INFINITY), + maxes, + maxes + at::log(at::sum(at::exp(self - maxes), dim, true))); + } else { + result = at::log(at::sum(at::exp(self), dim, true)); + } + if (! keepdim) + result.squeeze_(dim); + return result; +} + +Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::native::logsumexp_out(result, self, dim, keepdim); +} + +// \DIM REDUCE ################################################################ + +// MULTI DIM REDUCE ########################################################### + +// NB: this applies two optimizations: +// 1. Reducing the dimensions in the order of decreasing size, so that the +// larger dimensions are dealt earlier and we can work with less elements +// overall. +// E.g., reducing tensor of shape [1, 10, 200] over dimemsions {0, 1, 2}. 
+// If we reduce in the order of [0, 1, 2], the input and output +// shapes of iterations are: +// it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem) +// it 1: [10, 200] (2000 elem) => [200] ( 200 elem) +// it 2: [200] ( 200 elem) => [ 1] ( 1 elem) +// Since we need to iterate through all input elements at each +// iteration, total number of elements traversed is 4200. +// If we reduce in the order of [2, 1, 0], i.e., with decreasing +// size, the input and output shapes of iterations are: +// it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem) +// it 1: [1, 10] ( 10 elem) => [ 1] ( 1 elem) +// it 2: [1] ( 1 elem) => [ 1] ( 1 elem) +// Total number of elements traversed is 2011, much less than 4200. +// 2. Preallocated buffer. +// Utilizing the `_out` variant, instead of allocating new output tensors +// at each iteration, we can use a preallocated buffer. Since output numel +// in each iteration is decreasing, we can reuse the buffer throughout the +// loop. +// Note that we need two buffers, one containing the input, i.e., output +// from the previous iteration, and one containing the output for this +// iteration. +// The largest output size is the output size of the first iteration. After +// that the largest size we need is the output size of the second +// iteration. +// So we allocate +// 1. a region of size `input.numel() / input.size(reduced_dims[0])`, and +// 2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`. +// These two regions are allocated together as a contiguous flattened +// buffer tensor, with a variable `offset` indicating the starting position +// of the output region for the current iteration. +// E.g., reducing tensor of shape [4, 3, 2] over dimemsions {0, 1, 2}. +// Say we reduce in the order of [0, 1, 2]. +// The first buffer with has size `4 * 3 * 2 / 4 = 6`. +// The second buffer with has size `4 * 3 * 2 / (4 * 3) = 2`. +// So we allocate a tensor of size `6 + 2 = 8`: +// buffer: [ _, _, _, _, _, _, _, _] +// buffer region 1-->^^^^^^^^^^^^^^^^ ^^^^<--buffer region 2 +// 1st iteration: +// (before reduction) +// input: self (or input) +// input shape: [ 4, 3, 2] +// output shape: [ 3, 2] +// buffer: [ _, _, _, _, _, _, _, _] +// offset: ^--beginning of 1st buffer region, i.e., the +// starting output location of 1st iteration. +// (after reduction) +// buffer: [ {output of 1st it}, _, _] +// +// 2nd iteration: +// (before reduction) +// input: output of 1st it +// input shape: [ 3, 2] +// output shape: [ 2] +// buffer: [ {output of 1st it}, _, _] +// offset: ^--beginning of 2nd +// buffer region. We can't +// overwrite the 1st buffer +// as it contains input to +// reduction of this it. +// (after reduction) +// buffer: [ {output of 1st it}, {output of 2nd it}] +// +// 3rd iteration: +// (before reduction) +// input: output of 2nd it +// input shape: [ 2] +// output shape: [ 1] +// buffer: [ {output of 1st it}, {output of 2nd it}] +// offset: ^--beginning of 1st buffer region. We can +// safely overwrite now. +// (after reduction) +// buffer: [ {output of 3rd it}, {output of 2nd it}] +// Return {output of 3rd it}. +// +// TODO: If two or more reduced dimensions are contiguous, reduce as if they are +// a large dimension. 
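+// NB: The two templates below are parameterized by a single-dimension reduction
+// and its `_out` variant (for example, `_sum` and `_sum_out` further down in this
+// file), which they apply repeatedly, one reduced dimension at a time.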
+template +inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) { + if (dims_.size() == 1) { + return reduce_1(self, dims_[0], keepdim); + } + if (dims_.size() == 0) { + return self; + } + int64_t ndims = self.dim(); + // `reduced_numel` and `reduced_size` will be updated in the loop. + // Before that, they are just size and numel. + int64_t reduced_numel = self.numel(); + auto reduced_size = self.sizes().vec(); + auto dims = dims_.vec(); + maybe_wrap_dims(dims, ndims); + // Sort the reduced dimensions so that we reduce the larger dimensions first. + std::sort(dims.begin(), dims.end(), + [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; }); + // Calculate 1st buffer region size + int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]]; + int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]]; + // We separate `buffer` into two regions, one starting at 0, and another + // starting at max_reduced_numel. These two regions are used alternatively as + // the output of a `reduce_1` along a particular dimension. `offset` will + // indicate which region we should use next. + // Have keepdim=true when reducing. We will squeeze later. + auto buffer = at::empty({buffer_size}, self.options()); + int64_t offset = 0; + Tensor t = self; + for (auto& dim : dims) { + reduced_numel /= reduced_size[dim]; + reduced_size[dim] = 1; + auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size); + t = reduce_1_out(res, t, dim, true); + // switch to other buffer region + // this alternatively changes `offset` between 0 and max_reduced_numel + offset = max_reduced_numel - offset; + } + // squeeze if needed + if (!keepdim) { + std::vector squeezed_shape; + squeezed_shape.reserve(ndims - dims.size()); + auto reduce_dims = dim_list_to_bitset(dims_, ndims); + for (int64_t dim = 0; dim < ndims; dim++) { + if (!reduce_dims[dim]) { + squeezed_shape.emplace_back(reduced_size[dim]); + } + } + return t.view(squeezed_shape); + } + return t; +} + +// See comments above reduce_multi_associative for details. +template +inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) { + if (dims_.size() == 1) { + return reduce_1_out(result, self, dims_[0], keepdim); + } + if (dims_.size() == 0) { + // reduce_out should be clone_out with empty dims_ + return result.resize_as_(self).copy_(self); + } + int64_t ndims = self.dim(); + // `reduced_numel` and `reduced_size` will be updated in the loop. + // Before that, they are just size and numel. + int64_t reduced_numel = self.numel(); + auto reduced_size = self.sizes().vec(); + auto dims = dims_.vec(); + maybe_wrap_dims(dims, ndims); + // Sort the reduced dimensions so that we reduce the largest dimension first. + std::sort(dims.begin(), dims.end(), + [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; }); + // Calculate 1st buffer region size + int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]]; + int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]]; + // We separate `buffer` into two regions, one starting at 0, and another + // starting at max_reduced_numel. These two regions are used alternatively as + // the output of a `reduce_1` along a particular dimension. `offset` will + // indicate which region we should use next. + // Have keepdim=true when reducing. We will squeeze later. 
+ auto buffer = at::empty({buffer_size}, self.options()); + int64_t offset = 0; + Tensor t = self; + int64_t last_reduction = dims.size() - 1; + int64_t num_reduction = 0; + for (auto& dim : dims) { + reduced_numel /= reduced_size[dim]; + reduced_size[dim] = 1; + auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size); + if (num_reduction < last_reduction) { + t = reduce_1_out(res, t, dim, true); + } else { + reduce_1_out(result, t, dim, true); + } + // switch to other buffer region + // this alternatively changes `offset` between 0 and max_reduced_numel + offset = max_reduced_numel - offset; + num_reduction++; + } + // squeeze if needed (use in-place squeeze_) + if (!keepdim) { + auto reduce_dims = dim_list_to_bitset(dims_, ndims); + for (int64_t dim = ndims - 1; dim >= 0; dim--) { + if (reduce_dims[dim]) { + result.squeeze_(dim); + } + } + } + return result; +} + +Tensor& _sum_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + if (self.is_cuda()) { + return at::_sum_cuda_out(result, self, dim, keepdim); + } else { + return _sum_out_cpu(result, self, dim, keepdim); + } +} + +Tensor _sum(const Tensor &self, IntList dims, bool keepdim) { + return reduce_multi_associative<_sum, _sum_out>(self, dims, keepdim); +} + +Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) +{ + return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } +} + +Tensor all(const Tensor& self, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::all_out(result, self, dim, keepdim); +} + +Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "all only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { + return result; + } else { + return at::_th_all_out(result, self, dim, keepdim); + } +} + +Tensor any(const Tensor& self, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::any_out(result, self, dim, keepdim); +} + +Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "any only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, 
keepdim)) { + return result; + } else { + return at::_th_any_out(result, self, dim, keepdim); + } +} + +Tensor var(const Tensor& self, bool unbiased) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "var only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "var only supports floating-point dtypes"); + auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); + return trivial_return.has_value() ? trivial_return.value() : at::_th_var(self, unbiased); +} + +Tensor var(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::var_out(result, self, dim, unbiased, keepdim); +} + +Tensor &var_out(Tensor &result, const Tensor &self, int64_t dim, bool unbiased, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "var only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "var only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, std::numeric_limits::quiet_NaN(), dim, keepdim)) { + return result; + } else { + return at::_th_var_out(result, self, dim, unbiased, keepdim); + } +} + +Tensor std(const Tensor& self, bool unbiased) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "std only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "std only supports floating-point dtypes"); + auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); + return trivial_return.has_value() ? 
trivial_return.value() : at::_th_std(self, unbiased); +} + +Tensor std(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::std_out(result, self, dim, unbiased, keepdim); +} + +Tensor &std_out(Tensor &result, const Tensor &self, int64_t dim, bool unbiased, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "std only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "std only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, std::numeric_limits::quiet_NaN(), dim, keepdim)) { + return result; + } else { + return at::_th_std_out(result, self, dim, unbiased, keepdim); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h new file mode 100644 index 0000000..172d3c1 --- /dev/null +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -0,0 +1,55 @@ +#pragma once + +namespace at { namespace native { + +static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self, + int64_t dim) { + IntList self_sizes = self.sizes(); + std::vector result_sizes; + result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end()); + result_sizes[dim] = 1; + result.resize_(result_sizes); + return result; +} + +static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self, + Scalar ident, int64_t dim, bool keepdim) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + // Return identity + if (self.numel() == 0) { + _dimreduce_setup(result, self, dim); + result.fill_(ident); + if (!keepdim) result.squeeze_(dim); + return true; + } + return false; +} + +static bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self, + int64_t dim, bool keepdim, const char *fn_name) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + + if (self.numel() == 0) { + AT_ERROR("cannot perform reduction function ", fn_name, + " on tensor with no elements because the operation does not have an identity"); + } + return false; +} + +static at::optional _allreduce_return_trivial(const Tensor &self, Scalar ident) { + // Return identity + if (self.numel() == 0) { + return self.type().scalarTensor(ident); + } + return at::nullopt; +} + +}} // at::native diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp new file mode 100644 index 0000000..5995e43 --- /dev/null +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -0,0 +1,141 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { +namespace native { + +std::tuple RoiPooling2d_forward_cpu( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale) +{ + // Input is the output of the last convolutional layer in the Backbone network, so + // it should be in the format of NCHW + AT_CHECK(input.ndimension() == 4, "Input to RoI Pooling should be a NCHW Tensor"); + + // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first + // dim is the # of proposals, and the second dim is the proposal itself in the form + // [batch_index startW startH endW endH] + AT_CHECK(rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); + AT_CHECK(rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); + + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) + auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + // TODO: need some mechanism for determining train vs. test + + // During training, we need to store the argmaxes for the pooling operation, so + // the argmaxes Tensor should be the same size as the output Tensor + auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + AT_CHECK(input.is_contiguous(), "input must be contiguous"); + AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); + + auto *rawInput = input.data(); + auto inputChannelStride = inputHeight * inputWidth; + auto inputBatchStride = inputChannels * inputChannelStride; + auto *rawRois = rois.data(); + auto roiProposalStride = rois.size(1); + + auto *rawOutput = output.data(); + auto *rawArgmaxes = argmaxes.data(); + auto outputChannelStride = pooledHeight * pooledWidth; + + // Now that our Tensors are properly sized, we can perform the pooling operation. + // We iterate over each RoI and perform pooling on each channel in the input, to + // generate a pooledHeight x pooledWidth output for each RoI + for (auto i = 0; i < proposals; ++i) { + auto n = static_cast(rawRois[0]); + auto startWidth = static_cast(std::round(rawRois[1] * spatialScale)); + auto startHeight = static_cast(std::round(rawRois[2] * spatialScale)); + auto endWidth = static_cast(std::round(rawRois[3] * spatialScale)); + auto endHeight = static_cast(std::round(rawRois[4] * spatialScale)); + + // TODO: assertions for valid values? + // TODO: fix malformed ROIs?? 
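+    // For example, with spatialScale = 1, pooledHeight = pooledWidth = 2, and an
+    // RoI of [0, 2, 3, 8, 11], we get roiWidth = 8 - 2 = 6 and roiHeight = 11 - 3 = 8
+    // below, so each of the 2 x 2 output cells max-pools over a 4 x 3 tile of that
+    // channel (before clipping to the input bounds).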
+ + auto roiHeight = endHeight - startHeight; + auto roiWidth = endWidth - startWidth; + + // Because the Region of Interest can be of variable size, but our output + // must always be (pooledHeight x pooledWidth), we need to split the RoI + // into a pooledHeight x pooledWidth grid of tiles + + auto tileHeight = static_cast(roiHeight) / static_cast(pooledHeight); + auto tileWidth = static_cast(roiWidth) / static_cast(pooledWidth); + + auto *rawInputBatch = rawInput + (n * inputBatchStride); + + // Compute pooling for each of the (pooledHeight x pooledWidth) tiles for each + // channel in the input + for (auto ch = 0; ch < inputChannels; ++ch) { + for (auto ph = 0; ph < pooledHeight; ++ph) { + for (auto pw = 0; pw < pooledWidth; ++pw) { + auto tileHStart = static_cast(std::floor(ph * tileHeight)); + auto tileWStart = static_cast(std::floor(pw * tileWidth)); + auto tileHEnd = static_cast(std::ceil((ph + 1) * tileHeight)); + auto tileWEnd = static_cast(std::ceil((pw + 1) * tileWidth)); + + // Add tile offsets to RoI offsets, and clip to input boundaries + tileHStart = std::min(std::max(tileHStart + startHeight, 0), inputHeight); + tileWStart = std::min(std::max(tileWStart + startWidth, 0), inputWidth); + tileHEnd = std::min(std::max(tileHEnd + startHeight, 0), inputHeight); + tileWEnd = std::min(std::max(tileWEnd + startWidth, 0), inputWidth); + + auto poolIndex = (ph * pooledWidth) + pw; + + // If our pooling region is empty, we set the output to 0, otherwise to + // the min float so we can calculate the max properly + auto empty = tileHStart >= tileHEnd || tileWStart >= tileWEnd; + rawOutput[poolIndex] = empty ? 0 : std::numeric_limits::min(); + + // Set to -1 so we don't try to backprop to anywhere + // TODO: make optional for test + rawArgmaxes[poolIndex] = -1; + + for (auto th = tileHStart; th < tileHEnd; ++th) { + for (auto tw = tileWStart; tw < tileWEnd; ++tw) { + auto index = (th * inputWidth) + tw; + if (rawInputBatch[index] > rawOutput[poolIndex]) { + rawOutput[poolIndex] = rawInputBatch[index]; + // TODO: make optional for test + rawArgmaxes[poolIndex] = index; + } + } + } + } + } + // Increment raw pointers by channel stride + rawInputBatch += inputChannelStride; + rawOutput += outputChannelStride; + // TODO: make optional for test + rawArgmaxes += outputChannelStride; + } + // Increment RoI raw pointer + rawRois += roiProposalStride; + } + + return std::make_tuple(output, argmaxes); +} + +Tensor RoiPooling2d_backward_cpu( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale, + const Tensor& gradOutput, + const Tensor& argmaxes) { + throw std::runtime_error("not implemented"); +} + +} +} diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp new file mode 100644 index 0000000..546c758 --- /dev/null +++ b/aten/src/ATen/native/SoftMax.cpp @@ -0,0 +1,217 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Parallel.h" +#include "ATen/TensorUtils.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/native/cpu/SoftmaxKernel.h" + +namespace at { +namespace native { +namespace { + +template +void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + if (input.numel() == 0) { + return; + } + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + 
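+  // For example, softmax over dim = 1 of a contiguous [4, 3, 5] input gives
+  // outer_size = 4, dim_size = 3 and inner_size = 5; element (o, d, i) then lives
+  // at offset o * outer_stride + d * dim_stride + i, which is how the pointers are
+  // advanced below.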
int64_t dim_stride = inner_size; + int64_t outer_stride = dim_size * dim_stride; + scalar_t* input_data_base = input.data(); + scalar_t* output_data_base = output.data(); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); + parallel_for( + 0, outer_size * inner_size, grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / inner_size; + int64_t inner_idx = i % inner_size; + scalar_t* input_data = + input_data_base + outer_idx * outer_stride + inner_idx; + scalar_t* output_data = + output_data_base + outer_idx * outer_stride + inner_idx; + scalar_t max_input = input_data[0]; + for (int64_t d = 1; d < dim_size; d++) + max_input = std::max(max_input, input_data[d * dim_stride]); + + scalar_t tmpsum = 0; + for (int64_t d = 0; d < dim_size; d++) { + scalar_t z = std::exp(input_data[d * dim_stride] - max_input); + if (!LogSoftMax) { + output_data[d * dim_stride] = z; + } + tmpsum += z; + } + + if (LogSoftMax) + tmpsum = max_input + std::log(tmpsum); + else + tmpsum = 1 / tmpsum; + + for (int64_t d = 0; d < dim_size; d++) + if (LogSoftMax) + output_data[d * dim_stride] = input_data[d * dim_stride] - tmpsum; + else + output_data[d * dim_stride] *= tmpsum; + } + }); +} + +template +void host_softmax_backward( + Tensor& gI, + const Tensor& grad, + const Tensor& output, + int64_t dim) { + + int64_t outer_size = 1; + int64_t dim_size = grad.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= grad.size(i); + for (int64_t i = dim + 1; i < grad.dim(); ++i) + inner_size *= grad.size(i); + int64_t dim_stride = inner_size; + int64_t outer_stride = dim_size * dim_stride; + scalar_t* gradInput_data_base = gI.data(); + scalar_t* output_data_base = output.data(); + scalar_t* gradOutput_data_base = grad.data(); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); + parallel_for( + 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / inner_size; + int64_t inner_idx = i % inner_size; + scalar_t* gradInput_data = + gradInput_data_base + outer_idx * outer_stride + inner_idx; + scalar_t* output_data = + output_data_base + outer_idx * outer_stride + inner_idx; + const scalar_t* gradOutput_data = + gradOutput_data_base + outer_idx * outer_stride + inner_idx; + + scalar_t sum = 0; // TODO was accreal here + for (int64_t d = 0; d < dim_size; d++) + if (LogSoftMax) + sum += gradOutput_data[d * dim_stride]; + else + sum += + gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + + for (int64_t d = 0; d < dim_size; d++) { + if (LogSoftMax) { + gradInput_data[d * dim_stride] = gradOutput_data[d * dim_stride] - + std::exp(output_data[d * dim_stride]) * sum; + } else { + gradInput_data[d * dim_stride] = output_data[d * dim_stride] * + (gradOutput_data[d * dim_stride] - sum); + } + } + } + }); +} +} // namespace + +Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { + auto input = input_.contiguous(); + Tensor output = at::native::empty_like(input); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + if (input.dim() == 0) + input = input.view(1); + AT_CHECK( + dim >= 0 && dim < input.dim(), + "dim must be non-negative and less than input dimensions"); + if (input.ndimension() > 0 && dim == input.ndimension() - 1) { + softmax_lastdim_kernel(output, input); + } else { + AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { + host_softmax(output, input, dim); + }); + } + 
return output; +} + +Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { + auto input = input_.contiguous(); + Tensor output = at::native::empty_like(input); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + if (input.dim() == 0) + input = input.view(1); + AT_CHECK( + dim >= 0 && dim < input.dim(), + "dim must be non-negative and less than input dimensions"); + if (input.ndimension() > 0 && dim == input.ndimension() - 1) { + log_softmax_lastdim_kernel(output, input); + } else { + AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { + host_softmax(output, input, dim); + }); + } + return output; +} + +Tensor softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize("softmax_backward", grad_arg, output_arg); + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + Tensor grad_input = at::native::empty_like(grad); + + if (grad.dim() == 0) + grad = grad.view(1); + if (output.dim() == 0) + output = output.view(1); + AT_CHECK( + dim >= 0 && dim < grad.dim(), + "dim must be non-negative and less than input dimensions"); + if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { + softmax_backward_lastdim_kernel(grad_input, grad, output); + } else { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { + host_softmax_backward(grad_input, grad, output, dim); + }); + } + return grad_input; +} + +Tensor log_softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize("log_softmax_backward", grad_arg, output_arg); + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + Tensor grad_input = at::native::empty_like(grad); + + if (grad.dim() == 0) + grad = grad.view(1); + if (output.dim() == 0) + output = output.view(1); + AT_CHECK( + dim >= 0 && dim < grad.dim(), + "dim must be non-negative and less than input dimensions"); + if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { + log_softmax_backward_lastdim_kernel(grad_input, grad, output); + } else { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { + host_softmax_backward(grad_input, grad, output, dim); + }); + } + return grad_input; +} +} +} diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp new file mode 100644 index 0000000..5d1c883 --- /dev/null +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -0,0 +1,269 @@ +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/native/SpectralOpsUtils.h" + +#include +#include +#include + +namespace at { namespace native { + +// This is a pass-through wrapper function that does the size check and +// inferences. The actual forward implementation function is called +// at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU). 
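+// For reference, the public wrappers defined later in this file forward to _fft
+// with the following flags:
+//   fft:   complex_input=true,  complex_output=true,  inverse=false, onesided=false
+//   ifft:  complex_input=true,  complex_output=true,  inverse=true,  onesided=false
+//   rfft:  complex_input=false, complex_output=true,  inverse=false, onesided as passed
+//   irfft: complex_input=true,  complex_output=false, inverse=true,  onesided as passed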
+static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, + const bool complex_input, const bool complex_output, + const bool inverse, IntList signal_sizes, const bool normalized, + const bool onesided) { + + if (signal_ndim < 1 || signal_ndim > 3) { + std::ostringstream ss; + ss << "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=" + << signal_ndim; + throw std::runtime_error(ss.str()); + } + if (!at::isFloatingType(self.type().scalarType())) { + std::ostringstream ss; + ss << "Expected an input tensor of floating types, but got input=" + << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + + auto signal_tensor_ndim = signal_ndim + static_cast(complex_input); // add complex dim + if (self.dim() < signal_tensor_ndim) { + std::ostringstream ss; + ss << "Given signal_ndim=" << signal_ndim << ", expected an input tensor " + << "of at least" << signal_tensor_ndim << "D"; + if (complex_input) { + ss << " (complex input adds an extra dimension)"; + } + ss << ", but got input=" << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + + auto self_shape = self.sizes(); + auto batch_ndim = self.dim() - signal_tensor_ndim; + + Tensor input = self; + // flatten the batch dims + if (batch_ndim == 0) { + // slightly faster path for non-batch mode + input = input.unsqueeze(0); + } else if (batch_ndim > 1) { + std::vector flatten_input_shape(signal_tensor_ndim + 1); + std::copy(self_shape.begin() + batch_ndim, self_shape.end(), flatten_input_shape.begin() + 1); + flatten_input_shape[0] = -1; + input = input.reshape(flatten_input_shape); + + } + + // now we assume that input is batched as [ B x signal_dims... ] + + if (complex_input) { + if (input.size(signal_ndim + 1) != 2) { + std::ostringstream ss; + ss << "Expected an input tensor with a last dimension of size 2 " + << "representing real + imaginary components, but got input " + << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + } + + // build signal_sizes and output_size + if (signal_sizes.size() > 0 && static_cast(signal_sizes.size()) != signal_ndim) { + std::ostringstream ss; + ss << "Expected signal_sizes to be empty (default) or of signal_ndim=" + << signal_ndim << "D, but got signal_sizes=" << signal_sizes; + throw std::runtime_error(ss.str()); + } + std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); + output_sizes[0] = input.size(0); // batch size + std::vector checked_signal_sizes(signal_ndim); + for (int64_t i = 0; i < signal_ndim; i++) { + int64_t input_size = input.size(i + 1); + if (i == signal_ndim - 1 && onesided && complex_input && !complex_output) { + // If last dim and complex-to-real onesided, input is only half of + // signal, and we need to infer basing on signal_sizes, if given + // See native/SpectralOpsUtils.h for detailed description. 
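+      // For example, a onesided complex size of 6 is compatible with a real signal
+      // length of 10 or 11 (i.e., (6 - 1) * 2 or (6 - 1) * 2 + 1); when signal_sizes
+      // is provided it resolves this ambiguity, otherwise
+      // infer_ft_complex_to_real_onesided_size picks a default.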
+ int64_t inferred_size; + if (signal_sizes.size() > 0) { + inferred_size = infer_ft_complex_to_real_onesided_size(input_size, signal_sizes[i]); + } else { + inferred_size = infer_ft_complex_to_real_onesided_size(input_size); + } + checked_signal_sizes[i] = inferred_size; + output_sizes[i + 1] = inferred_size; + } else { + if (i == signal_ndim - 1 && onesided && !complex_input && complex_output) { + // if last dim and real-to-complex onesided, output should be only + // half of the signal, and we need to infer using input_size + output_sizes[i + 1] = infer_ft_real_to_complex_onesided_size(input_size); + } else { + output_sizes[i + 1] = input_size; + } + checked_signal_sizes[i] = input_size; + if (signal_sizes.size() > 0 && signal_sizes[i] != checked_signal_sizes[i]) { + std::ostringstream ss; + ss << "Expected given signal_sizes=" << signal_sizes << " to have same " + << "shape with input at signal dimension " << i << ", but got " + << "signal_sizes=" << signal_sizes << " and input=" << self.type() + << self.sizes(); + throw std::runtime_error(ss.str()); + } + } + } + if (complex_output) { + output_sizes[signal_ndim + 1] = 2; + } + + Tensor output = at::_fft_with_size(input, signal_ndim, complex_input, + complex_output, inverse, + checked_signal_sizes, normalized, onesided, + output_sizes); + + // unflatten the batch dims + if (batch_ndim == 0) { + // slightly faster path for non-batch mode + output = output.squeeze(0); + } else if (batch_ndim > 1) { + auto output_ndim = self.dim() + static_cast(complex_output) - static_cast(complex_input); + std::vector unflatten_output_shape(output_ndim); + std::copy(self_shape.begin(), self_shape.begin() + batch_ndim, unflatten_output_shape.begin()); + std::copy(output_sizes.begin() + 1, output_sizes.end(), unflatten_output_shape.begin() + batch_ndim); + output = output.reshape(unflatten_output_shape); + } + return output; +} + +// We call the following methods via CUDA hooks because they are really only +// valid when CUDA is available. See native/cuda/CuFFTPlanCache.h for more details. 
+int64_t _cufft_get_plan_cache_max_size() {
+  return detail::getCUDAHooks().cuFFTGetPlanCacheMaxSize();
+}
+
+void _cufft_set_plan_cache_max_size(int64_t max_size) {
+  detail::getCUDAHooks().cuFFTSetPlanCacheMaxSize(max_size);
+}
+
+int64_t _cufft_get_plan_cache_size() {
+  return detail::getCUDAHooks().cuFFTGetPlanCacheSize();
+}
+
+void _cufft_clear_plan_cache() {
+  detail::getCUDAHooks().cuFFTClearPlanCache();
+}
+
+Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ true, /* inverse */ false, {}, normalized,
+              /* onesided */ false);
+}
+
+Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ true, /* inverse */ true, {}, normalized,
+              /* onesided */ false);
+}
+
+Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
+            const bool onesided) {
+  return _fft(self, signal_ndim, /* complex_input */ false,
+              /* complex_output */ true, /* inverse */ false, {}, normalized,
+              onesided);
+}
+
+Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
+             const bool onesided, IntList signal_sizes) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ false, /* inverse */ true, signal_sizes,
+              normalized, onesided);
+}
+
+Tensor stft(const Tensor& self, const int64_t n_fft, const int64_t hop_length,
+            const int64_t win_length, const Tensor& window,
+            const bool normalized, const bool onesided) {
+  #define REPR(SS) \
+    SS << "stft(" << self.type() << self.sizes() << ", n_fft=" << n_fft \
+       << ", hop_length=" << hop_length << ", win_length=" << win_length \
+       << ", window="; \
+    if (window.defined()) { \
+      SS << window.type() << "{" << window.sizes() << "}"; \
+    } else { \
+      SS << "None"; \
+    } \
+    SS << ", normalized=" << normalized << ", onesided=" << onesided << ")"
+
+  if (!at::isFloatingType(self.type().scalarType()) || self.dim() > 2 || self.dim() < 1) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected a 1D or 2D tensor of floating point types";
+    AT_ERROR(ss.str());
+  }
+  Tensor input = self;
+  if (self.dim() == 1) {
+    input = input.unsqueeze(0);
+  }
+  int64_t batch = input.size(0);
+  int64_t len = input.size(1);
+  if (n_fft <= 0 || n_fft > len) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected 0 < n_fft <= " << len
+             << ", but got n_fft=" << n_fft;
+    AT_ERROR(ss.str());
+  }
+  if (hop_length <= 0) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length;
+    AT_ERROR(ss.str());
+  }
+  if (win_length <= 0 || win_length > n_fft) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected 0 < win_length <= n_fft, but got win_length="
+             << win_length;
+    AT_ERROR(ss.str());
+  }
+  if (window.defined() && (window.dim() != 1 || window.size(0) != win_length)) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected a 1D window tensor of size equal to win_length="
+             << win_length << ", but got window with size " << window.sizes();
+    AT_ERROR(ss.str());
+  }
+  #undef REPR
+  auto window_ = window;
+  if (win_length < n_fft) {
+    // pad the (possibly implicit all-ones) window to length n_fft, centered
+    window_ = at::zeros({n_fft}, self.options());
+    auto left = (n_fft - win_length) / 2;
+    if (window.defined()) {
+      window_.narrow(0, left, win_length).copy_(window);
+    } else {
+      window_.narrow(0, left, win_length).fill_(1);
+    }
+  }
+  int64_t n_frames = 1 + (len - n_fft) / hop_length;
+  // time2col
+  input =
input.as_strided( + {batch, n_frames, n_fft}, + {input.stride(0), hop_length * input.stride(1), input.stride(1)} + ); + if (window_.defined()) { + input = input.mul(window_); + } + // rfft and transpose to get (batch x fft_size x num_frames) + auto out = input.rfft(1, normalized, onesided).transpose_(1, 2); + if (self.dim() == 1) { + return out.squeeze_(0); + } else { + return out; + } +} + +}} // at::native diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h new file mode 100644 index 0000000..7518d1f --- /dev/null +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include + +namespace at { namespace native { + +// NOTE [ Fourier Transform Conjugate Symmetry ] +// +// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +// assuming X is the transformed K-dimensionsal signal, we have +// +// X[i_1, ..., i_K] = X[j_i, ..., j_K]*, +// +// where j_k = (N_k - i_k) mod N_k, N_k being the signal size at dim k, +// * is the conjugate operator. +// +// Therefore, in such cases, FFT libraries return only roughly half of the +// values to avoid redundancy: +// +// X[:, :, ..., :floor(N / 2) + 1] +// +// This is also the assumption in cuFFT and MKL. In ATen SpectralOps, such +// halved signal will also be returned by default (flag onesided=True). +// The following infer_ft_real_to_complex_onesided_size function calculates the +// onesided size from the twosided size. +// +// Note that this loses some information about the size of signal at last +// dimension. E.g., both 11 and 10 maps to 6. Hence, the following +// infer_ft_complex_to_real_onesided_size function takes in optional parameter +// to infer the twosided size from given onesided size. +// +// cuFFT doc: http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional +// MKL doc: https://software.intel.com/en-us/mkl-developer-reference-c-dfti-complex-storage-dfti-real-storage-dfti-conjugate-even-storage#CONJUGATE_EVEN_STORAGE + +inline int64_t infer_ft_real_to_complex_onesided_size(int64_t real_size) { + return (real_size / 2) + 1; +} + +inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, + int64_t expected_size=-1) { + int64_t base = (complex_size - 1) * 2; + if (expected_size < 0) { + return base + 1; + } else if (base == expected_size) { + return base; + } else if (base + 1 == expected_size) { + return base + 1; + } else { + std::ostringstream ss; + ss << "expected real signal size " << expected_size << " is incompatible " + << "with onesided complex frequency size " << complex_size; + throw std::runtime_error(ss.str()); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp new file mode 100644 index 0000000..fbd07cc --- /dev/null +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -0,0 +1,64 @@ +// Returns the frequency of elements of input non-negative integer tensor. 
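+//
+// Illustrative example (hypothetical values):
+//   bincount([1, 3, 1, 0])                               -> [1, 2, 0, 1]
+//   bincount([1, 3, 1, 0], weights=[0.5, 1., 0.25, 2.])  -> [2.0, 0.75, 0.0, 1.0]
+//   bincount([1, 3, 1, 0], minlength=6)                  -> [1, 2, 0, 1, 0, 0]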
+ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" + +#include + +namespace at { namespace native { + +///////////////// bincount ///////////////// +namespace { + +template +Tensor _bincount_cpu_template( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + if (minlength < 0) { + AT_ERROR("minlength should be >= 0"); + } + if (self.dim() != 1 || self.numel() == 0 || *self.min().data() < 0) { + AT_ERROR("bincount only supports 1-d non-negative integral inputs."); + } + + bool has_weights = weights.defined(); + if (has_weights && weights.size(0) != self.size(0)) { + AT_ERROR("input and weights should have the same length"); + } + + Tensor output; + int64_t nbins = static_cast(*self.max().data()) + 1L; + nbins = std::max(nbins, minlength); // at least minlength # of bins + + const input_t* self_p = self.contiguous().data(); + if (has_weights) { + output = native::zeros({nbins}, weights.options()); + weights_t* output_p = output.data(); + const weights_t* weights_p = weights.contiguous().data(); + for (int64_t i = 0; i < self.size(0); i++) { + output_p[self_p[i]] += weights_p[i]; + } + } else { + output = native::zeros({nbins}, kLong); + int64_t* output_p = output.data(); + for (int64_t i = 0; i < self.size(0); i++) { + output_p[self_p[i]] += 1L; + } + } + return output; +} +} // namespace + +Tensor +_bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) { + return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] { + const auto scalar = weights.type().scalarType(); + if (scalar == ScalarType::Undefined || scalar == ScalarType::Float) + return _bincount_cpu_template(self, weights, minlength); + return _bincount_cpu_template( + self, weights.toType(CPU(kDouble)), minlength); + }); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp new file mode 100644 index 0000000..52df990 --- /dev/null +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -0,0 +1,227 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ReduceOpsUtils.h" + +namespace { +template +void where_cpu( + at::Tensor& ret, + const at::Tensor& condition, + const at::Tensor& self, + const at::Tensor& other) { + at::CPU_tensor_apply4( + ret, + condition, + self, + other, + [](scalar_t& ret_val, + const uint8_t& cond_val, + const scalar_t& self_val, + const scalar_t& other_val) { + ret_val = cond_val ? 
self_val : other_val; + }); +} +} // namespace + +namespace at { namespace native { + +bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { + return at::isclose(self, other, rtol, atol, equal_nan).all().toCByte(); +} + +Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { + // TODO: use bitwise operator overloads once we add them + auto actual_error = (self - other).abs(); + auto max_error = atol + rtol * other.abs(); + auto close = actual_error <= max_error; + + // Handle +/-inf + close.__ior__(self == other); + close.__iand__((self == INFINITY) == (other == INFINITY)); + close.__iand__((self == -INFINITY) == (other == -INFINITY)); + + if (equal_nan) { + close.__ior__((self != self).__and__((other != other))); + } + return close; +} + +bool is_nonzero(const Tensor& self) { + auto n = self.numel(); + AT_ASSERT(n >= 0); + if (n == 0) { + AT_ERROR("bool value of Tensor with no values is ambiguous"); + } + if (n > 1) { + AT_ERROR("bool value of Tensor with more than one value is ambiguous"); + } + Scalar localScalar = self.pImpl->localScalar(); + if (localScalar.isFloatingPoint()) { + return localScalar.to() != 0; + } else if (localScalar.isIntegral()){ + return localScalar.to() != 0; + } + AT_ERROR("expected non-Tensor backed scalar"); +} + +Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { + if (condition.type().scalarType() != ScalarType::Byte) { + AT_ERROR("Expected condition to have ScalarType Byte, but got ScalarType ", + toString(condition.type().scalarType())); + } + Tensor b_condition, b_self, b_other; + std::tie(b_condition, b_self, b_other) = expand_outplace(condition, self, other, "where"); + return at::_s_where(b_condition, b_self, b_other); +} + +Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_ALL_TYPES(ret.type(), "where", [&] { + where_cpu(ret, condition, self, other); + }); + return ret; +} + +std::tuple kthvalue(const Tensor& self, int64_t k, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::kthvalue_out(values, indices, self, k, dim, keepdim); +} + +std::tuple kthvalue_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t k, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "kthvalue only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "kthvalue")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_kthvalue_out(values, indices, self, k, dim, keepdim); + } +} + +std::tuple median(const Tensor& self, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::median_out(values, indices, self, dim, keepdim); +} + +std::tuple median_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "median only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if 
(_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "median")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_median_out(values, indices, self, dim, keepdim); + } +} + +std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::mode_out(values, indices, self, dim, keepdim); +} + +std::tuple mode_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "mode only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "mode")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_mode_out(values, indices, self, dim, keepdim); + } +} + +std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { + Tensor max = self.type().tensor(); + Tensor max_indices = self.type().toScalarType(kLong).tensor(); + return at::native::max_out(max, max_indices, self, dim, keepdim); +} + +std::tuple max_out(Tensor& max, Tensor& max_indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "max only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(max, self, dim, keepdim, "max")) { + AT_ASSERT(max.dim() == 0); + max_indices.resize_({}).fill_(0); + return std::forward_as_tuple(max, max_indices); + } else { + return at::_th_max_out(max, max_indices, self, dim, keepdim); + } +} + +Tensor max_values(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<0>(self.max(dim, keepdim)); +} + +std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { + Tensor min = self.type().tensor(); + Tensor min_indices = self.type().toScalarType(kLong).tensor(); + return at::native::min_out(min, min_indices, self, dim, keepdim); +} + +std::tuple min_out(Tensor& min, Tensor& min_indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "min only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(min, self, dim, keepdim, "min")) { + AT_ASSERT(min.dim() == 0); + min_indices.resize_({}).fill_(0); + return std::forward_as_tuple(min, min_indices); + } else { + return at::_th_min_out(min, min_indices, self, dim, keepdim); + } +} + +Tensor min_values(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<0>(self.min(dim, keepdim)); +} + +// argmax and argmin + +Tensor argmax(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<1>(self.max(dim, keepdim)); +} + +Tensor argmax(const Tensor& self) { + return std::get<1>(self.reshape({-1}).max(/*dim=*/0)); +} + +Tensor argmin(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<1>(self.min(dim, keepdim)); +} + +Tensor argmin(const Tensor& self) { + return std::get<1>(self.reshape({-1}).min(/*dim=*/0)); +} + +// `argmin` and `argmax` are exposed in C++ but not in 
Python, where we only +// expose `_argmin` and `_argmax` (which call the first versions). In Python, +// we then define our own `argmax` and `argmin` that handle passing `dim=None`, +// which gets the argmax/argmin of the flattened array. + +Tensor _argmax(const Tensor& self, int64_t dim, bool keepdim) { + return at::argmax(self, dim, keepdim); +} + +Tensor _argmin(const Tensor& self, int64_t dim, bool keepdim) { + return at::argmin(self, dim, keepdim); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp new file mode 100644 index 0000000..d8c856b --- /dev/null +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -0,0 +1,636 @@ +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif + +#include "ATen/ATen.h" +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include "ATen/ScalarType.h" +#include "ATen/Deprecated.h" +#include "ATen/TensorOptions.h" +#include "TH/THRandom.h" + +#include +#include +#include + +namespace at { +namespace native { +namespace { +void window_function_checks( + const char* function_name, + const TensorOptions& options, + int64_t window_length) { + AT_CHECK( + options.layout() != kSparse, + function_name, + " is not implemented for sparse types, got: ", + options.type().toString()); + AT_CHECK( + at::isFloatingType(options.dtype()), + function_name, + " expects floating point dtypes, got: ", + options.type().toString()); + AT_CHECK( + window_length >= 0, + function_name, + " requires non-negative window_length, got window_length=", + window_length); +} +} // namespace + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor arange(Scalar start, Scalar end, const TensorOptions& options) { + return native::arange(start, end, /*step=*/1, options); +} + +Tensor arange( + Scalar start, + Scalar end, + Scalar step, + const TensorOptions& options) { + return options.type()._arange(start, end, step); +} + +Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { + return native::arange_out(result, start, end, /*step=*/1); +} + +Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { + return at::_arange_out(result, start, end, step); +} + +Tensor arange(Scalar end, const TensorOptions& options) { + return options.type()._arange(end); +} + +Tensor& arange_out(Tensor& result, Scalar end) { + return at::_arange_out(result, end); +} + +Tensor _dim_arange(const Tensor& like, int64_t dim) { + return like.type().toScalarType(at::kLong)._arange(like.size(dim)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor empty(IntList size, const TensorOptions& options) { + return options.type().tensor(size); +} + +Tensor& empty_out(Tensor& result, IntList size) { + if (result.is_sparse()) { + result.sparse_raw_resize_(size, size.size(), 0); + } else { + result.resize_(size); + } + return result; +} + +// Temporary type cast operators. These are needed to trace type-casts now since +// Type's are not supported in the IR. Instead, we call down to these +// specialized operators for each datatype. 
+// TODO: remove when we have Type support in the IR + +#define DEFINE_CAST_OP(_1, n, _2) \ + Tensor _cast_##n(const Tensor& self, bool non_blocking) { \ + auto& target_type = self.type().toScalarType(ScalarType::n); \ + if (self.type() == target_type) \ + return self; \ + return target_type.copy(self, non_blocking); \ + } + +AT_FORALL_SCALAR_TYPES(DEFINE_CAST_OP) + +#undef DEFINE_CAST_OP + +Tensor empty_like(const Tensor& self) { + return native::empty_like(self, self.options()); +} + +Tensor empty_like(const Tensor& self, const TensorOptions& options) { + if (options.layout() == kSparse && self.type().is_sparse()) { + auto res = options.type().tensor({}); + // resize_as_ requires the same exact type. + res.sparse_raw_resize_(self.sizes(), self._sparseDims(), self._denseDims()); + + return res; + } + return native::empty(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor eye(int64_t n, const TensorOptions& options) { + return native::eye(n, -1, options); +} + +Tensor eye(int64_t n, int64_t m, const TensorOptions& options) { + auto tensor = options.type().tensor({}); + return at::eye_out(tensor, n, m); +} + +Tensor& eye_out_cpu(Tensor& result, int64_t n) { + return native::eye_out_cpu(result, n, -1); +} + +Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else + AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif + +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else + if(m < 0) { +#endif + m = n; + } + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + AT_DISPATCH_ALL_TYPES(result.type(), "eye", [&]() -> void { + scalar_t* result_data = result.data(); + for(int64_t i = 0; i < sz; i++) { + result_data[i*(result.strides()[0] + result.strides()[1])] = 1; + } + }); + + return result; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ full ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor full(IntList size, Scalar fill_value, const TensorOptions& options) { + if (options.layout() == kSparse) { + AT_ERROR("full(...) is not implemented for sparse layout"); + } + auto result = options.type().tensor(size); + return result.fill_(fill_value); +} + +Tensor& full_out(Tensor& result, IntList size, Scalar fill_value) { + if (result.is_sparse()) { + AT_ERROR("full(...) 
is not implemented for sparse layout"); + } + result.resize_(size); + return result.fill_(fill_value); +} + +Tensor full_like(const Tensor& self, Scalar fill_value) { + return native::full_like(self, fill_value, self.options()); +} + +Tensor full_like(const Tensor& self, Scalar fill_value, const TensorOptions& options) { + return native::full(self.sizes(), fill_value, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linspace ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor linspace(Scalar start, Scalar end, const TensorOptions& options) { + return native::linspace(start, end, /*steps=*/100, options); +} + +Tensor linspace( + Scalar start, + Scalar end, + int64_t steps, + const TensorOptions& options) { + return options.type()._linspace(start, end, steps); +} + +Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { + return native::linspace_out(result, start, end, /*steps=*/100); +} + +Tensor& linspace_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { + return at::_linspace_out(result, start, end, steps); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ logspace ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor logspace(Scalar start, Scalar end, const TensorOptions& options) { + return native::logspace(start, end, /*steps=*/100, options); +} + +Tensor logspace( + Scalar start, + Scalar end, + int64_t steps, + const TensorOptions& options) { + return options.type()._logspace(start, end, steps); +} + +Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { + return native::logspace_out(result, start, end, /*steps=*/100); +} + +Tensor& logspace_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { + return at::_logspace_out(result, start, end, steps); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ones ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor ones(IntList size, const TensorOptions& options) { + return native::full(size, /*fill_value=*/1, options); +} + +Tensor& ones_out(Tensor& result, IntList size) { + return native::full_out(result, size, /*fill_value=*/1); +} + +Tensor ones_like(const Tensor& self) { + return native::ones(self.sizes(), self.options()); +} + +Tensor ones_like(const Tensor& self, const TensorOptions& options) { + return native::ones(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ rand ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor rand(IntList size, const TensorOptions& options) { + return native::rand(size, nullptr, options); +} + +Tensor rand(IntList size, Generator* generator, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.uniform_(0, 1, generator); +} + +Tensor& rand_out(Tensor& result, IntList size) { + return native::rand_out(result, size, nullptr); +} + +Tensor& rand_out(Tensor& result, IntList size, Generator* generator) { + result.resize_(size); + return result.uniform_(0, 1, generator); +} + +Tensor rand_like(const Tensor& self) { + return native::rand_like(self, self.options()); +} + +Tensor rand_like(const Tensor& self, const TensorOptions& options) { + return native::rand(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randint ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor randint(int64_t high, IntList size, const TensorOptions& options) { + return native::randint(high, size, nullptr, options); +} + +Tensor randint( + int64_t high, + IntList size, + Generator* generator, + const TensorOptions& options) { + return native::randint(0, high, size, generator, options); +} + +Tensor randint( + int64_t low, + int64_t high, + IntList size, + const 
TensorOptions& options) { + return native::randint(low, high, size, nullptr, options); +} + +Tensor randint( + int64_t low, + int64_t high, + IntList size, + Generator* generator, + const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.random_(low, high, generator); +} + +Tensor& randint_out(Tensor& result, int64_t high, IntList size) { + return native::randint_out(result, high, size, nullptr); +} + +Tensor& randint_out( + Tensor& result, + int64_t high, + IntList size, + Generator* generator) { + result.resize_(size); + return result.random_(0, high, generator); +} + +Tensor& randint_out(Tensor& result, int64_t low, int64_t high, IntList size) { + return native::randint_out(result, low, high, size, nullptr); +} + +Tensor& randint_out( + Tensor& result, + int64_t low, + int64_t high, + IntList size, + Generator* generator) { + result.resize_(size); + return result.random_(low, high, generator); +} + +Tensor randint_like(const Tensor& self, int64_t high) { + return native::randint_like(self, high, self.options()); +} + +Tensor randint_like(const Tensor& self, int64_t low, int64_t high) { + return native::randint_like(self, low, high, self.options()); +} + +Tensor randint_like( + const Tensor& self, + int64_t high, + const TensorOptions& options) { + return native::randint(high, self.sizes(), nullptr, options); +} + +Tensor randint_like( + const Tensor& self, + int64_t low, + int64_t high, + const TensorOptions& options) { + return native::randint(low, high, self.sizes(), nullptr, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor randn(IntList size, const TensorOptions& options) { + return native::randn(size, nullptr, options); +} + +Tensor randn(IntList size, Generator* generator, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.normal_(0, 1, generator); +} + +Tensor& randn_out(Tensor& result, IntList size) { + return native::randn_out(result, size, nullptr); +} + +Tensor& randn_out(Tensor& result, IntList size, Generator* generator) { + result.resize_(size); + return result.normal_(0, 1, generator); +} + +Tensor randn_like(const Tensor& self) { + return native::randn_like(self, self.options()); +} + +Tensor randn_like(const Tensor& self, const TensorOptions& options) { + return native::randn(self.sizes(), nullptr, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +namespace { +template +void randperm_cpu(Tensor& result, int64_t n, THGenerator* generator) { + scalar_t *r__data = result.data(); + + result.resize_({n}); + int64_t r__stride_0 = result.stride(0); + + for(int64_t i = 0; i < n; i++) { + r__data[i*r__stride_0] = static_cast(i); + } + + for(int64_t i = 0; i < n - 1; i++) + { + int64_t z = THRandom_random(generator) % (n-i); + scalar_t sav = r__data[i*r__stride_0]; + r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; + r__data[(z+i)*r__stride_0] = sav; + } +} +} // namespace + + +THGenerator* get_generator(at::Generator* gen) { + auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto gen_ = at::check_generator(gen, default_gen); + return gen_->generator; +} + +Tensor randperm(int64_t n, const TensorOptions& options) { + return native::randperm(n, nullptr, options); +} + +Tensor randperm(int64_t n, Generator* generator, const TensorOptions& options) { + auto tensor = options.type().tensor(n); + return at::randperm_out(tensor, n, generator); +} + +Tensor& 
randperm_out(Tensor& result, int64_t n) { + return at::randperm_out(result, n, nullptr); +} + +Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { + AT_CHECK(n >= 0, "n must be non-negative, got", n); + result.resize_({n}); + auto gen = get_generator(generator); + AT_DISPATCH_ALL_TYPES(result.type(), "randperm", [&]() -> void { + randperm_cpu(result, n, gen); + }); + + return result; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor range(Scalar start, Scalar end, const TensorOptions& options) { + return native::range(start, end, /*step=*/1, options); +} + +Tensor range( + Scalar start, + Scalar end, + Scalar step, + const TensorOptions& options) { + return options.type()._range(start, end, step); +} + +Tensor& range_out(Tensor& result, Scalar start, Scalar end) { + return native::range_out(result, start, end, 1); +} + +Tensor& range_out(Tensor& result, Scalar start, Scalar end, Scalar step) { + return at::_range_out(result, start, end, step); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ zeros ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor zeros(IntList size, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.zero_(); +} + +Tensor& zeros_out(Tensor& result, IntList size) { + if (result.is_sparse()) { + result.sparse_raw_resize_(size, size.size(), 0); + } else { + result.resize_(size); + } + return result.zero_(); +} + +Tensor zeros_like(const Tensor& self) { + return native::zeros_like(self, self.options()); +} + +Tensor zeros_like(const Tensor& self, const TensorOptions& options) { + if (options.layout() == kSparse && self.type().is_sparse()) { + auto res = options.type().tensor({}); + // resize_as_ requires the same exact type. + res.sparse_raw_resize_(self.sizes(), self._sparseDims(), self._denseDims()); + return res; + } + return native::zeros(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ bartlett_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor bartlett_window(int64_t window_length, const TensorOptions& options) { + return native::bartlett_window(window_length, /*periodic=*/true, options); +} + +Tensor bartlett_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("bartlett_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + auto window = native::arange(window_length, options).mul_(2. / static_cast(window_length - 1)); + const int64_t first_half_size = ((window_length - 1) >> 1) + 1; + window.narrow(0, first_half_size, window_length - first_half_size).mul_(-1).add_(2); + return periodic ? 
window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor blackman_window(int64_t window_length, const TensorOptions& options) { + return native::blackman_window(window_length, /*periodic=*/true, options); +} + +Tensor blackman_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("blackman_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + // from https://en.wikipedia.org/wiki/Window_function#Blackman_window + auto window = native::arange(window_length, options).mul_(M_PI / static_cast(window_length - 1)); + window = window.mul(4).cos_().mul_(0.08) - window.mul(2).cos_().mul_(0.5) + 0.42; + return periodic ? window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hamming_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor hamming_window(int64_t window_length, const TensorOptions& options) { + return native::hamming_window(window_length, /*periodic=*/true, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + return native::hamming_window( + window_length, periodic, /*alpha=*/0.54, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + double alpha, + const TensorOptions& options) { + return native::hamming_window( + window_length, periodic, alpha, /*beta=*/0.46, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + double alpha, + double beta, + const TensorOptions& options) { + window_function_checks("hamming_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + auto window = native::arange(window_length, options); + window.mul_(M_PI * 2. / static_cast(window_length - 1)).cos_().mul_(-beta).add_(alpha); + return periodic ? 
window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hann_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor hann_window(int64_t window_length, const TensorOptions& options) { + return native::hann_window(window_length, /*periodic=*/true, options); +} + +Tensor hann_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("hann_window", options, window_length); + return native::hamming_window( + window_length, periodic, /*alpha=*/0.5, /*beta=*/0.5, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { + auto result = at::empty(values.size(), options); + AT_ASSERT(result.is_contiguous()); + AT_DISPATCH_ALL_TYPES(result.type(), "tensor_cpu", [&] { + std::copy(values.begin(), values.end(), result.template data()); + }); + return result; +} + +template +Tensor tensor_cuda(ArrayRef values, const TensorOptions& options) { + auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(at::kCPU)); + return cpu_tensor.to(options.device()); +} + +#define TENSOR(T, _1, _2) \ + Tensor tensor(ArrayRef values, const TensorOptions& options) { \ + if (options.device().is_cuda()) { \ + return tensor_cuda(values, options); \ + } else { \ + return tensor_cpu(values, options); \ + } \ + } +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) +#undef TENSOR +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp new file mode 100644 index 0000000..881f626 --- /dev/null +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -0,0 +1,39 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/detail/CUDAHooksInterface.h" + +#include "ATen/Config.h" +namespace at { +namespace native { + +bool is_same_size(const Tensor& self, const Tensor& other) { + return self.sizes().equals(other.sizes()); +} + +int64_t size(const Tensor& self, int64_t dim) { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = maybe_wrap_dim(dim, self.dim(), false); + return self.sizes()[dim]; +} + +int64_t stride(const Tensor& self, int64_t dim) { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = maybe_wrap_dim(dim, self.dim(), false); + return self.strides()[dim]; +} + +bool cudnn_is_acceptable(const Tensor& self) { + if (!globalContext().userEnabledCuDNN()) return false; + if (!self.is_cuda()) return false; + auto st = self.type().scalarType(); + if (!(st == kDouble || st == kFloat || st == kHalf)) return false; + if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; + // NB: In the old Python code, there was also a test to see if the + // cuDNN library was actually dynamically linked or not. I'm not + // sure if we can actually test this. 
+ return true; +} + +} +} diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp new file mode 100644 index 0000000..f248f3e --- /dev/null +++ b/aten/src/ATen/native/TensorShape.cpp @@ -0,0 +1,679 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/optional.h" +#include + +#include +#include + +namespace at { +namespace native { + +static void check_cat_no_zero_dim(TensorList tensors) { + for(size_t i = 0; i < tensors.size(); ++i) { + auto& t = tensors[i]; + if (t.dim() == 0) { + AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + } + } +} + +Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim) { + check_cat_no_zero_dim(tensors); + dim = legacy_cat_wrap_dim(dim, tensors); + return at::_cat_out(result, tensors, dim); +} + +Tensor cat(TensorList tensors, int64_t dim) { + check_cat_no_zero_dim(tensors); + dim = legacy_cat_wrap_dim(dim, tensors); + return at::_cat(tensors, dim); +} + +std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { + if (self.dim() == 0) { + AT_ERROR("chunk expects at least a 1-dimensional tensor"); + } + if (chunks <= 0) { + AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); + } + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; + + // We need to call split_with_sizes in the case where split_size and dimension size are 0, because + // a call to split would discard the number of chunks (because we can have an arbitrary number of + // 0-sized chunks adding up to 0). So, call split_with_sizes with the correct number of chunks, + // eventually we will do this for all cases. + if (split_size == 0 && self.size(dim) == 0) { + std::vector split_sizes(chunks, split_size); + split_sizes[chunks - 1] = split_size - (split_size * chunks - self.size(dim)); + return self.split_with_sizes(split_sizes, dim); + } else { + return self.split(split_size, dim); + } +} + +Tensor diagflat(const Tensor& self, int64_t offset) { + return self.contiguous().view(-1).diag(offset); +} + +Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_) { + int64_t nDims = self.dim(); + int64_t dim1 = maybe_wrap_dim(dim1_, nDims); + int64_t dim2 = maybe_wrap_dim(dim2_, nDims); + AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + int64_t diag_size; + int64_t storage_offset = self.storage_offset(); + // compute storage offset and size for the diagonal + // for positive values of offset (above the main diagonal) + // "leftmost columns" (along dim2) are dropped + // for negative values of offset (below the main diagonal) + // "topmost rows" (along dim1) are dropped. + // Note that we invert +/- in the second to absorb the negative + // sign in the offset. + if (offset >= 0) { + diag_size = std::max(std::min(self.size(dim1), self.size(dim2)-offset), 0); + } else { + diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); + } +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude +#endif + + // NumPy allows you to specify offsets "off the end"; let's just be careful not to + // set a ridiculous storage_offset in that case (technically it shouldn't matter + // because there are no elements in the tensor, but let's be kosher). 
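+  // Worked example (illustrative): for a 3 x 4 input, offset = 1 gives
+  // diag_size = max(min(3, 4 - 1), 0) = 3, while offset = -2 gives
+  // diag_size = max(min(3 - 2, 4), 0) = 1.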
+ if (diag_size == 0) { + // skip + } else if (offset >= 0) { + storage_offset += offset * self.stride(dim2); + } else { + storage_offset -= offset * self.stride(dim1); + } + + // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) + // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + sizes.erase(sizes.begin() + std::max(dim1, dim2)); + strides.erase(strides.begin() + std::max(dim1, dim2)); + sizes.erase(sizes.begin() + std::min(dim1, dim2)); + strides.erase(strides.begin() + std::min(dim1, dim2)); + sizes.push_back(diag_size); + strides.push_back(self.stride(dim1)+self.stride(dim2)); + + // return view with new parameters + return self.as_strided(sizes, strides, storage_offset); +} + +Tensor expand(const Tensor& self, IntList size, bool implicit) { + // [expand implicit] + // The implicit flag is set to true for any expand calls inserted by broadcast + // operators in ExpandUtils.h This flag is recorded by the tracer to + // distinguish between expands inserted by broadcasts and those explicitly + // requested by the user, because it is legal to remove implicit expands + // from the graph, but not legal to remove the explicit ones. + if (size.size() < (size_t)self.dim()) { + std::ostringstream ss; + ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size + << "): the number of sizes provided (" << size.size() << ") " + << "must be greater or equal to the number of dimensions in the tensor (" + << self.dim() << ")"; + throw std::runtime_error(ss.str()); + } + + std::vector expandedSizes; + std::vector expandedStrides; + std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self, size); + + return self.as_strided(expandedSizes, expandedStrides); +} + +Tensor expand_as(const Tensor& self, const Tensor& other) { + return self.expand(other.sizes()); +} + +Tensor as_strided(const Tensor& self, IntList size, IntList stride) { + return self.as_strided(size, stride, self.storage_offset()); +} + +Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { + return self.as_strided_(size, stride, self.storage_offset()); +} + +Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { + AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + auto cur_size = self.size(dim); + if (start < 0) { + AT_ERROR("start out of range"); + } +#ifndef USE_TH_SIZE_ZERO_DIM + if (length <= 0 || start > cur_size - length) { +#else + if (length < 0 || start > cur_size - length) { +#endif + AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); + } + return at::slice(self, dim, start, start + length, 1); +} + +Tensor permute(const Tensor& self, IntList dims) { + auto nDims = self.dim(); + if (dims.size() != (size_t)nDims) { + AT_ERROR("number of dims don't match in permute"); + } + auto oldSizes = self.sizes(); + auto oldStrides = self.strides(); + std::vector newSizes(nDims); + std::vector newStrides(nDims); + std::vector seen(nDims); + for (int64_t i = 0; i < nDims; i++) { + auto dim = maybe_wrap_dim(dims[i], nDims); + if (seen[dim]) { + AT_ERROR("repeated dim in permute"); + } + seen[dim] = true; + newSizes[i] = oldSizes[dim]; + newStrides[i] = oldStrides[dim]; + } + return self.as_strided(newSizes, newStrides); +} + +Tensor repeat(const Tensor& self, IntList repeats) { + if (repeats.size() < 
(size_t)self.dim()) { + AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); + } + + // Add new leading dimensions to the tensor if the + // number of target dimensions is larger than the + // number of source dimensions. + int64_t num_new_dimensions = repeats.size() - self.dim(); + std::vector padded_size(num_new_dimensions, 1); + padded_size.insert(padded_size.end(), self.sizes().begin(), self.sizes().end()); + std::vector target_size(repeats.size()); + for(size_t idx = 0; idx < repeats.size(); ++idx) { + target_size[idx] = padded_size[idx] * repeats[idx]; + } + + Tensor xtensor = self.expand(padded_size); + + Tensor result = self.type().tensor(target_size); + Tensor urtensor = result.type().alias(result); + for (int64_t i = 0; i < xtensor.dim(); ++i) { + // can't unfold with step 0, so make sure step is at least 1 + // (it doesn't matter what it is in that case, because the size is 0). + urtensor = urtensor.unfold(i, xtensor.size(i), std::max(xtensor.size(i), 1)); + } + + urtensor.copy_(xtensor.expand_as(urtensor)); + + return result; +} + +// Infers the size of a dim with size -1, if it exists. Also checks that new +// shape is compatible with the number of elements. +static std::vector infer_size(IntList shape, int64_t numel) { + auto res = shape.vec(); + int64_t newsize = 1; + auto infer_dim = at::optional(); + for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { + if (shape[dim] == -1) { + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } + infer_dim = dim; + } else if (shape[dim] >= 0) { + newsize *= shape[dim]; + } else { + AT_ERROR("invalid shape dimension ", shape[dim]); + } + } + + if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (infer_dim) { + // we have a degree of freedom here to select the dimension size; follow NumPy semantics + // and just bail. + AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); + res[*infer_dim] = numel / newsize; + } +#ifndef USE_TH_SIZE_ZERO_DIM + if (numel == 0) { + // Collapse zero-element shapes into one dimension because TH handles zeros + // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this + // once we have multi-dimensional empty tensors. 
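+      // e.g. (illustrative): any reshape of a 0-element tensor, say to
+      // (2, 0, 3), currently comes back with the collapsed 1-D shape {0}.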
+ return {0}; + } +#endif + return res; + } + + std::ostringstream ss; + ss << "shape '" << shape << "' is invalid for input of size " << numel; + throw std::runtime_error(ss.str()); +} + +Tensor reshape(const Tensor& self, IntList proposed_shape) { + if (self.type().is_sparse()) { + AT_ERROR("reshape is not implemented for sparse tensors"); + } + auto shape = infer_size(proposed_shape, self.numel()); + if (auto stride = THTensor_compute_stride(self.sizes(), self.strides(), shape)) { + return self.as_strided(shape, *stride); + } + return at::_unsafe_view(self.clone(), shape); +} + +Tensor reshape_as(const Tensor& self, const Tensor& other) { + return self.reshape(other.sizes()); +} + +Tensor select(const Tensor& self, int64_t dim, int64_t index) { + int64_t ndim = self.dim(); + AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, ndim); + auto size = self.size(dim); + if (index < -size || index >= size) { + std::stringstream ss; + ss << "select(): index " << index << " out of range for tensor of size "; + ss << self.sizes() << " at dimension " << dim; + throw std::runtime_error(ss.str()); + } + if (index < 0) { + index += size; + } + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + auto storage_offset = self.storage_offset() + index * strides[dim]; + sizes.erase(sizes.begin() + dim); + strides.erase(strides.begin() + dim); + return self.as_strided(sizes, strides, storage_offset); +} + +Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_t step) { + int64_t ndim = self.dim(); + AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, ndim); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + if (step <= 0) { + // TODO: support negative strides + throw std::runtime_error("slice step must be positive"); + } + if (start < 0) { + start += sizes[dim]; + } + if (end < 0) { + end += sizes[dim]; + } + if (start < 0) { + start = 0; + } else if (start >= sizes[dim]) { + start = sizes[dim]; + } + if (end < start) { + end = start; + } else if (end >= sizes[dim]) { + end = sizes[dim]; + } + auto storage_offset = self.storage_offset() + start * strides[dim]; + auto len = end - start; +#ifndef USE_TH_SIZE_ZERO_DIM + if (len == 0) { + // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now + return self.type().tensor(); + } +#endif + sizes[dim] = (len + step - 1) / step; // round-up + strides[dim] *= step; + return self.as_strided(sizes, strides, storage_offset); +} + +std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { + AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + AT_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + AT_CHECK(split_size > 0 || self.size(dim) == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. 
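+    // e.g. (illustrative): dim_size = 5, split_size = 2 -> num_splits = 3, and
+    // below last_split_size = 2 - (2 * 3 - 5) = 1, i.e. pieces of sizes 2, 2, 1.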
+ num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + std::vector splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - dim_size); + + for (int64_t i = 0; i < num_splits; ++i) { + auto length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = self.narrow(dim, i * split_size, length); + } + return splits; +} + +std::vector split_with_sizes(const Tensor& self, IntList split_sizes, int64_t dim) { + AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + int64_t dim_size = self.size(dim); + int64_t num_splits = split_sizes.size(); + std::vector splits(num_splits); + int64_t start_idx = 0; + int64_t i; + + for (i = 0; i < num_splits; ++i) { + auto length = split_sizes[i]; + if (length < 0) { + std::ostringstream ss; + ss << "split_with_sizes expects split_sizes have only non-negative " + << "entries, but got split_sizes=" << split_sizes; + throw std::runtime_error(ss.str()); + } + splits[i] = self.narrow(dim, start_idx, length); + start_idx += length; + } + if (start_idx != dim_size) { + std::ostringstream ss; + ss << "split_with_sizes expects split_sizes to sum exactly to " + << dim_size << " (input tensor's size at dimension " << dim << "), " + << "but got split_sizes=" << split_sizes; + throw std::runtime_error(ss.str()); + } + return splits; +} + +static inline std::vector get_stack_inputs(TensorList tensors, int64_t dim) { + std::vector inputs(tensors.size()); + for (size_t i = 0; i < tensors.size(); ++i) { + inputs[i] = tensors[i].unsqueeze(dim); + } + return inputs; +} + +Tensor stack(TensorList tensors, int64_t dim) { + if (tensors.size() == 0) { + throw std::runtime_error("stack expects a non-empty TensorList"); + } + dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); + return at::cat(get_stack_inputs(tensors, dim), dim); +} + +Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { + if (tensors.size() == 0) { + throw std::runtime_error("stack expects a non-empty TensorList"); + } + dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); + return at::cat_out(result, get_stack_inputs(tensors, dim), dim); +} + +static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + int64_t nsparseDims = self._sparseDims(); + if (dim0 >= nsparseDims || dim1 >= nsparseDims) { + AT_ERROR( + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); + } + + if (self._indices().numel() == 0 && self._values().numel() == 0) { + std::vector sizes(self.sizes()); + std::swap(sizes[dim0], sizes[dim1]); + + return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); + } else { + auto indices = self._indices(); + auto row0 = indices.select(0, dim0); + auto row1 = indices.select(0, dim1); + + // swap row0 and row1 + auto tmp = at::zeros_like(row0); + tmp.copy_(row0); + row0.copy_(row1); + row1.copy_(tmp); + + std::vector sizes(self.sizes()); + std::swap(sizes[dim0], sizes[dim1]); + + return self.sparse_raw_resize_(sizes, -1, -1); + } +} + +Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + auto ndims = self.dim(); + dim0 = maybe_wrap_dim(dim0, ndims); + dim1 = maybe_wrap_dim(dim1, ndims); + if (dim0 == dim1) { + return self; + } + + if (self.is_sparse()) { + return sparse_transpose_(self, dim0, dim1); + } + + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); + std::swap(strides[dim0], strides[dim1]); + std::swap(sizes[dim0], sizes[dim1]); + return 
self.as_strided_(sizes, strides); +} + +Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { + auto ndims = self.dim(); + dim0 = maybe_wrap_dim(dim0, ndims); + dim1 = maybe_wrap_dim(dim1, ndims); + if (dim0 == dim1) { + return self; + } + + if (self.is_sparse()) { + Tensor self_clone = self.clone(); // yes, this is what THS does + return sparse_transpose_(self_clone, dim0, dim1); + } + + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); + std::swap(strides[dim0], strides[dim1]); + std::swap(sizes[dim0], sizes[dim1]); + return self.as_strided(sizes, strides); +} + +static void check_t(const Tensor& self, const char *fn) { + if (self.is_sparse()) { + int64_t sparseDims = self._sparseDims(); + int64_t denseDims = self._denseDims(); + if (!(sparseDims == 2 && denseDims == 0)) { + AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); + } + } else if (self.dim() != 2) { + AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); + } +} + +Tensor t(const Tensor & self) { + check_t(self, "t()"); + return self.transpose(0, 1); +} + +Tensor & t_(Tensor & self) { + check_t(self, "t_()"); + return self.transpose_(0, 1); +} + +std::tuple, std::vector > +inferSqueezeGeometry(const Tensor &tensor) { + std::vector sizes; + std::vector strides; + + for(int64_t d = 0; d < tensor.dim(); d++) { + if(tensor.sizes()[d] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } + } + + return std::make_tuple(sizes, strides); +} + +std::tuple, std::vector > +inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { + std::vector sizes; + std::vector strides; + + for(int64_t d = 0; d < tensor.dim(); d++) { + if(d != dim || tensor.sizes()[dim] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } + } + return std::make_tuple(sizes, strides); +} + +std::tuple, std::vector > +inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { +#ifndef USE_TH_SIZE_ZERO_DIM + if (tensor.numel() == 0) { + throw std::runtime_error("cannot unsqueeze empty tensor"); + } +#endif + std::vector sizes(tensor.sizes()); + std::vector strides(tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; + sizes.insert(sizes.begin() + dim, 1); + strides.insert(strides.begin() + dim, new_stride); + + return std::make_tuple(sizes, strides); +} + +Tensor squeeze(const Tensor& self) { + auto g = inferSqueezeGeometry(self); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor squeeze(const Tensor& self, int64_t dim) { + int64_t dims = self.dim(); + dim = maybe_wrap_dim(dim, dims); + + if (dims == 0 || self.sizes()[dim] != 1) { + return self.as_strided(self.sizes().vec(), self.strides().vec()); + } + auto g = inferSqueezeGeometry(self, dim); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor & squeeze_(Tensor& self) { + auto g = inferSqueezeGeometry(self); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +Tensor & squeeze_(Tensor& self, int64_t dim) { + int64_t dims = self.dim(); + dim = maybe_wrap_dim(dim, self.dim()); + + if (dims == 0 || self.sizes()[dim] != 1) { + return self.as_strided_(self.sizes().vec(), self.strides().vec()); + } + auto g = inferSqueezeGeometry(self, dim); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +// _unsafe_view() differs from view() in that the returned tensor isn't treated +// as a view for the purposes of automatic differentiation. (It's not listed in +// VIEW_FUNCTIONS in gen_autograd.py). It's only safe to use if the `self` tensor +// is temporary. For example, the viewed tensor here (a + b) is discarded immediately +// after viewing: +// +// res = at::_unsafe_view(a + b, size); +// +// This is a hack because in-place operations on tensors treated like views +// can be much more expensive than the same operations on non-view tensors. +Tensor _unsafe_view(const Tensor& self, IntList size) { + return self.view(size); +} + +Tensor unsqueeze(const Tensor& self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim() + 1); + + auto g = inferUnsqueezeGeometry(self, dim); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor & unsqueeze_(Tensor& self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim() + 1); + + auto g = inferUnsqueezeGeometry(self, dim); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { + start_dim = maybe_wrap_dim(start_dim, self.dim()); + end_dim = maybe_wrap_dim(end_dim, self.dim()); + AT_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + + if (start_dim == end_dim) { + return self; + } + + // We don't want to infer_size on the entire shape, because that can give us an extra degree + // of freedom we don't want; for example, consider shape [0, 1, 3, 0], with start_dim=1, end_dim=2. + // It's clear we want result shape [0, 3, 0] but passing [0, -1, 0] to infer_size means the -1 + // can take on any value and satisfy the constraints. 
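+  // Concrete (illustrative) case: flattening a [2, 3, 4] tensor with
+  // start_dim=1, end_dim=2 gives slice_numel = 3 * 4 = 12 and result shape [2, 12].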
+ auto slice_numel = prod_intlist(self.sizes().slice(start_dim, end_dim - start_dim + 1)); + std::vector shape; + shape.reserve(self.dim() - end_dim + start_dim); + for (int64_t i = 0; i < start_dim; i++) { + shape.push_back(self.size(i)); + } + shape.push_back(slice_numel); + for (int64_t i = end_dim + 1; i < self.dim(); i++) { + shape.push_back(self.size(i)); + } + + return self.reshape(shape); +} + +Tensor view_as(const Tensor& self, const Tensor& other) { + return self.view(other.sizes()); +} + +int64_t numel(const Tensor& self) { + return self.pImpl->numel(); +} + +std::vector unbind(const Tensor &self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim()); + int64_t size = self.size(dim); + std::vector tensors(size); + for (int i = 0; i < size; i++) { + tensors[i] = self.select(dim, i); + } + return tensors; +} + +std::vector meshgrid(TensorList tensors) { + int64_t size = tensors.size(); + AT_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); + std::vector shape(size); + for(int64_t i = 0; i < size; i++) { + switch (tensors[i].dim()) { + case 0: + shape[i] = 1; + break; + case 1: + shape[i] = tensors[i].size(0); + break; + default: + AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); + } + } + std::vector grids; + for(int64_t i = 0; i < size; i++) { + std::vector view_shape(size, 1); + view_shape[i] = -1; + grids.push_back(tensors[i].view(view_shape).expand(shape)); + } + return grids; +} + +} +} diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp new file mode 100644 index 0000000..8bce12c --- /dev/null +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -0,0 +1,59 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor flip_cpu(const Tensor& self, IntList dims) { + const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); + check_errors(total_dims, flip_dims_size, dims); + + auto flip_dims_v = std::vector(dims); + std::sort(flip_dims_v.begin(), flip_dims_v.end()); + auto final_indices = std::vector(total_dims); + + auto indices = std::vector(flip_dims_size); + for (int64_t i = 0; i < flip_dims_size; i++) { + indices[i] = at::arange(self.size(flip_dims_v[i]) - 1, -1, -1, self.type().toScalarType(at::kLong)); + // creates a meshgrid + auto temp = std::vector(flip_dims_size, 1); + temp[i] = indices[i].size(0); + indices[i] = indices[i].view(IntList(temp)); + final_indices[flip_dims_v[i]] = indices[i]; + } + + // check if distance between two flip dims >= 2, where permute of output tensor is needed, + // because the advanced indexing puts all non-consecutive indices in the beginning of the tensor + bool to_permute = false; + int64_t first = flip_dims_v[0], second = flip_dims_v[0]; + for (int64_t i = 1; i < flip_dims_size; i++) { + second = flip_dims_v[i]; + if (second - first >= 2) { + to_permute = true; + break; + } + first = second; + } + + if (to_permute) { + // permute output tensor + auto permute_order = std::vector(flip_dims_v); + for (int64_t i = 0; i < total_dims; i++) { + if (std::find(flip_dims_v.begin(), flip_dims_v.end(), i) == flip_dims_v.end()) { + permute_order.emplace_back(i); + } + } + auto out_tensor = self.index(TensorList(final_indices)); + return out_tensor.permute(IntList(permute_order)); + } + + auto out_tensor = self.index(TensorList(final_indices)); + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.h 
b/aten/src/ATen/native/TensorTransformations.h new file mode 100644 index 0000000..554a46f --- /dev/null +++ b/aten/src/ATen/native/TensorTransformations.h @@ -0,0 +1,39 @@ +#include "ATen/ATen.h" + +#include + +#include +#include + +namespace at { +namespace native { + +static inline void check_errors(int64_t total_dims, int64_t flip_dims_size, IntList dims) { + // check if number of axis in dim is valid + AT_CHECK(flip_dims_size > 0, + "expected input tensor dims > 0, but got tensor dims size=", flip_dims_size); + + // check duplicates in dims + auto flip_dims_v = std::vector(dims); + flip_dims_v.erase(std::unique(flip_dims_v.begin(), flip_dims_v.end()), flip_dims_v.end()); + AT_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, + "dims has duplicates, original flip dims size=", flip_dims_size, + ", but unique flip dims size=", flip_dims_v.size()); + + // check len of dims + AT_CHECK(flip_dims_size <= total_dims, + "expected flip dims size <= tensor total dims, but got flip dims size=", + flip_dims_size, " and tensor total dim=", total_dims); + + // check if dims axis within range + auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); + + AT_CHECK(*min_max_d.first >= 0, + "expected flip dims axis >= 0, but got min flip dims=", *min_max_d.first); + + AT_CHECK(*min_max_d.second < total_dims, + "expected flip dims axis < tensor total dims, but got max flip dims=", + *min_max_d.second, " and tensor total dim=", total_dims); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp new file mode 100644 index 0000000..a3c5f68 --- /dev/null +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -0,0 +1,37 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { namespace native { + +bool is_cuda(const Tensor& self) { + return self.type().is_cuda(); +} + +bool is_distributed(const Tensor& self) { + return self.type().is_distributed(); +} + +bool is_floating_point(const Tensor& self) { + return at::isFloatingType(self.type().scalarType()); +} + +bool is_signed(const Tensor &self) { + if (self.type().scalarType() == ScalarType::Half) { + return true; + } + return AT_DISPATCH_ALL_TYPES(self.type(), "is_signed", [&]() -> bool { + return std::is_signed(); + }); +} + +bool is_sparse(const Tensor& self) { + return self.type().is_sparse(); +} + +Tensor type_as(const Tensor& self, const Tensor& other) { + return self.toType(other.type()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp new file mode 100644 index 0000000..f32a206 --- /dev/null +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -0,0 +1,100 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" + +#include "ATen/CPUApplyUtils.h" +#include "ATen/Parallel.h" +#include "ATen/native/cpu/UnaryOpsKernel.h" + +#include +#include +#include +#include +#include + +#include + +// NOTE: +// YOU ARE NOT OBLIGED TO USE THESE MACROS +// If you're writing something more specialized, please don't try to make them +// work for your case, but just write something new instead. 
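To make the macro-generated entry points easier to follow, here is roughly what IMPLEMENT_UNARY_OP_VEC(acos), defined just below, expands to (a sketch, not part of the patch; acosImpl is the per-CPU-capability dispatch stub declared in native/cpu/UnaryOpsKernel.h):

```cpp
// Approximate expansion of IMPLEMENT_UNARY_OP_VEC(acos).
Tensor acos(const Tensor& self) {
  Tensor result = self.type().tensor();   // allocate an output of the same type
  return at::acos_out(result, self);
}
Tensor& _acos__cpu(Tensor& self_) {        // in-place variant
  if (self_.numel() > 0) {
    Tensor self = sort_strides(self_);
    acosImpl(self, self);                  // runtime-dispatched vectorized kernel
  }
  return self_;
}
Tensor& _acos_out_cpu(Tensor& result, const Tensor& self) {  // out-of-place variant
  result.resize_(self.sizes());
  if (result.numel() > 0) {
    acosImpl(result, self);
  }
  return result;
}
```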
+ +namespace at { +namespace native { + +Tensor& fill_(Tensor& self, Scalar value) { + return self._fill_(value); +} + +Tensor& fill_(Tensor& self, const Tensor& value) { + return self._fill_(value); +} + +// NB: If you use this macro, you may also need to add a CUDA forwarding +// stub in CUDAUnaryOps + +#define IMPLEMENT_UNARY_OP_VEC(op) \ + Tensor op(const Tensor& self) { \ + Tensor result = self.type().tensor(); \ + return at::op##_out(result, self); \ + } \ + Tensor& _##op##__cpu(Tensor& self_) { \ + if (self_.numel() > 0) { \ + Tensor self = sort_strides(self_); \ + op##Impl(self, self); \ + } \ + return self_; \ + } \ + Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ + result.resize_(self.sizes()); \ + if (result.numel() > 0) { \ + op##Impl(result, self); \ + } \ + return result; \ + } + +#define IMPLEMENT_UNARY_OP_TH(op) \ + Tensor op(const Tensor& self) { \ + Tensor result = self.type().tensor(); \ + return at::op##_out(result, self); \ + } \ + Tensor& _##op##__cpu(Tensor& self) { \ + return at::op##_out(self, self); \ + } \ + Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ + result.resize_(self.sizes()); \ + return at::_##op##_out(result, self); \ + } + +// NB: Temp. defaulting to TH implementation of abs due to issues with Apple + +IMPLEMENT_UNARY_OP_TH(abs) +IMPLEMENT_UNARY_OP_VEC(acos) +IMPLEMENT_UNARY_OP_VEC(asin) +IMPLEMENT_UNARY_OP_VEC(atan) +IMPLEMENT_UNARY_OP_VEC(ceil) +IMPLEMENT_UNARY_OP_VEC(cos) +IMPLEMENT_UNARY_OP_TH(cosh) +IMPLEMENT_UNARY_OP_VEC(erf) +IMPLEMENT_UNARY_OP_VEC(erfc) +IMPLEMENT_UNARY_OP_VEC(exp) +IMPLEMENT_UNARY_OP_VEC(expm1) +IMPLEMENT_UNARY_OP_VEC(floor) +IMPLEMENT_UNARY_OP_VEC(log) +IMPLEMENT_UNARY_OP_VEC(log10) +IMPLEMENT_UNARY_OP_VEC(log1p) +IMPLEMENT_UNARY_OP_VEC(log2) +IMPLEMENT_UNARY_OP_VEC(round) +IMPLEMENT_UNARY_OP_VEC(rsqrt) +IMPLEMENT_UNARY_OP_VEC(sigmoid) +IMPLEMENT_UNARY_OP_VEC(sin) +IMPLEMENT_UNARY_OP_TH(sinh) +IMPLEMENT_UNARY_OP_VEC(sqrt) +IMPLEMENT_UNARY_OP_VEC(tan) +IMPLEMENT_UNARY_OP_VEC(tanh) +IMPLEMENT_UNARY_OP_VEC(trunc) + +} +} // namespace at diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp new file mode 100644 index 0000000..d9bd94e --- /dev/null +++ b/aten/src/ATen/native/Unique.cpp @@ -0,0 +1,60 @@ +// Returns unique elements of input tensor. 
+ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" + +#include +#include +#include +#include + +namespace at { +namespace native{ + +namespace { + +template +std::tuple _unique_cpu_template( + const Tensor& self, + const bool sorted, + const bool return_inverse) { + const Tensor& input = self.contiguous(); + const scalar_t* input_data = input.data(); + std::unordered_set set(input_data, input_data + input.numel()); + Tensor output = at::empty({static_cast(set.size())}, input.type()); + scalar_t* output_data = output.data(); + + if (sorted) { + std::vector vec(set.begin(), set.end()); + std::sort(vec.begin(), vec.end()); + std::copy(vec.begin(), vec.end(), output_data); + } else { + std::copy(set.begin(), set.end(), output_data); + } + + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + inverse_indices.resize_(input.sizes()); + int64_t* inverse_indices_data = inverse_indices.data(); + std::unordered_map inverse_map; + inverse_map.reserve(output.numel()); + for (int i = 0; i < output.numel(); ++i) { + inverse_map[output_data[i]] = i; + } + for (int i = 0; i < input.numel(); ++i) { + inverse_indices_data[i] = inverse_map[input_data[i]]; + } + } + return std::make_tuple(output, inverse_indices); +} +} // namespace + +std::tuple +_unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] { + return _unique_cpu_template(self, sorted, return_inverse); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp new file mode 100644 index 0000000..458e9ac --- /dev/null +++ b/aten/src/ATen/native/Vision.cpp @@ -0,0 +1,28 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" + +namespace { + enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; +} + +namespace at { namespace native { + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + padding_mode == GridSamplerModeZeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); + } + if (input.dim() == 5) { + return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); + } + AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/CapabilityDispatch.h b/aten/src/ATen/native/cpu/CapabilityDispatch.h new file mode 100644 index 0000000..6cb0f27 --- /dev/null +++ b/aten/src/ATen/native/cpu/CapabilityDispatch.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include + +// Implements instruction set specific function dispatch. +// +// Kernels that may make use of specialized instruction sets (e.g. AVX) are +// compiled multiple times with different compiler flags (e.g. -mavx). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In native/cpu/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// DispatchStub stub; +// +// In native/cpu/MyKernel.cpp: +// void kernel(const Tensor& x) { ... 
} +// REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(tensor); +// + +namespace at { +namespace native { + +enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; + +template +struct DispatchStub { + static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); + + template + void operator()(ArgTypes... args) { + if (!dispatch_ptr) { + dispatch_ptr = choose_impl(); + } + (*dispatch_ptr)(args...); + } + + FnPtr choose_impl() { +// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + int avx2 = static_cast(CPUCapability::AVX2); + if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && + cpuinfo_has_x86_fma3() && table[avx2]) { + return table[avx2]; + } + int avx = static_cast(CPUCapability::AVX); + if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { + return table[avx]; + } + } +#endif + int def = static_cast(CPUCapability::DEFAULT); + AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); + return table[def]; + } + + FnPtr dispatch_ptr = nullptr; + FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; +}; + + +#if defined(CPU_CAPABILITY) + +constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; + +// Registers an implementation a kernel for the current CPU capability. +template +struct RegisterDispatch { + RegisterDispatch(DispatchStub& stub, FnPtr value) { + stub.table[static_cast(CURRENT_CAPABILITY)] = value; + } +}; + +// We only define the stub once in the DEFAULT capability compilation +#if defined(CPU_CAPABILITY_DEFAULT) +#define _DEFINE_STUB(stub, fn) DispatchStub stub +#else +#define _DEFINE_STUB(stub, fn) +#endif + +#define REGISTER_DISPATCH(stub, fn) \ + _DEFINE_STUB(stub, fn); \ + static RegisterDispatch stub ## __register(stub, fn); + +#endif + +} +} diff --git a/aten/src/ATen/native/cpu/Intrinsics.h b/aten/src/ATen/native/cpu/Intrinsics.h new file mode 100644 index 0000000..702b2be --- /dev/null +++ b/aten/src/ATen/native/cpu/Intrinsics.h @@ -0,0 +1,25 @@ +#pragma once + +#if defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y]) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif diff --git a/aten/src/ATen/native/cpu/README b/aten/src/ATen/native/cpu/README new file mode 100644 index 0000000..ac8263d --- /dev/null +++ b/aten/src/ATen/native/cpu/README @@ -0,0 +1,30 @@ +TODO: Clarify and add more documentation all around. + +All of the *.cpp files in this folder will be compiled under all compiler +flags specified by CPU_CAPABILITY_FLAGS in aten/src/ATen/CMakeLists.txt. + +The purpose of this is to allow the compilation with various compiler +flags to enable features such as AVX instructions, while using runtime +dispatch, which makes sure only valid instructions will be used on any +given platform. 
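As a concrete illustration of the dispatch scheme described above, a minimal sketch of how a new kernel plugs into the DispatchStub/REGISTER_DISPATCH machinery from CapabilityDispatch.h (MyKernel.h, MyKernel.cpp and my_kernel follow the hypothetical names already used in that header's comment):

```cpp
// native/cpu/MyKernel.h (hypothetical)
#pragma once
#include "ATen/ATen.h"
#include "CapabilityDispatch.h"

namespace at { namespace native {
using my_fn = void (*)(Tensor&, const Tensor&);
extern DispatchStub<my_fn> my_kernel;  // one table slot per CPUCapability
}} // namespace at::native

// native/cpu/MyKernel.cpp (hypothetical), compiled once per CPU_CAPABILITY flag set
#include "MyKernel.h"
namespace at { namespace native {
namespace {
void my_kernel_impl(Tensor& out, const Tensor& in) {
  // capability-specific body; may use Vec256, AVX intrinsics, etc.
}
} // anonymous namespace

// Fills the table slot for the capability this translation unit was built with;
// the stub variable itself is only defined in the DEFAULT-capability build.
REGISTER_DISPATCH(my_kernel, &my_kernel_impl);
}} // namespace at::native

// A caller outside native/cpu (e.g. native/MyOp.cpp) just invokes the stub; the
// fastest registered implementation is chosen at runtime via cpuinfo:
//   my_kernel(result, self);
```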
+
+Vec256.h provides a generic implementation of a vec256 type that allows
+the programmer to write code packing various primitives (such as floats)
+within 256-bit registers. vec256 defines various operators such as + and *
+and provides functions to allow operations such as max, min, etc.
+
+As an example, ReduceOpsKernel.cpp implements a generic kernel_ that reduces
+an entire array using a given associative binary operation such as +.
+
+More explicitly, calling kernel_ with template argument std::plus will cause
+it to sum up the entire array into a single value.
+
+ReduceOpsKernel.cpp uses the CPU_CAPABILITY_* macros to "know" under which
+compiler flags it is currently compiled. This allows the programmer to write
+generic code, which will be compiled under multiple compilation settings.
+
+../ReduceOps.cpp now includes the header ReduceOpsKernel.h, which contains
+a generic definition of sumImplAll. This function allows the user to reduce
+over a dimension or all dimensions. The appropriate capability is chosen at
+runtime using cpuinfo. If the current platform has AVX, sumImpl will be set
+to sumImplAll.
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
new file mode 100644
index 0000000..0e749c2
--- /dev/null
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -0,0 +1,191 @@
+#include "ATen/native/cpu/ReduceOpsKernel.h"
+
+#include
+#include
+#include
+
+#include "ATen/Dispatch.h"
+#include "ATen/Parallel.h"
+#include "ATen/cpu/vec256/vec256.h"
+#include "ATen/optional.h"
+
+namespace at { namespace native { namespace {
+
+using namespace vec256;
+
+static inline int64_t round_down(int64_t a, int64_t m) {
+  return a - (a % m);
+}
+
+template <typename F>
+static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) {
+  if (parallelize) {
+    parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) {
+      int64_t k = begin * step;
+      for (int64_t i = begin; i < end; i++, k += step) {
+        func(k);
+      }
+    });
+  } else {
+    for (int64_t i = 0; i != size; i += step) {
+      func(i);
+    }
+  }
+}
+
+// Vectorized reduction defined by reduce operation `Op` with identity `ident`.
+// The reduction is built on top of reduce128, which reduces down a column
+// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
+// because of the "adjacent cache line prefetch" behavior on x86 CPUs.
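+// For example (illustrative): with float (4-byte) elements, WIDTH = 128 / 4 = 32,
+// which reduce128 holds in four Vec256<float> accumulators of 8 lanes each
+// (4 * 32 bytes = 128 bytes, i.e. two adjacent cache lines).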
+template class Op, int ident> +struct Reduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + + using Vec = Vec256; + using Reduce = Op; + using ReduceScalar = Op; + + static void apply(Tensor& res, const Tensor& self, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel); + return; + } + + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + // A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / (n * stride); + bool paralellize = batch * n > internal::GRAIN_SIZE; + if (stride == 1) { + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t b = begin; b < end; b++) { + const scalar_t* data = &data_[b * n]; + scalar_t* out = &out_[b]; + scalar_t buf[WIDTH] = {0}; + std::fill(buf, buf + WIDTH, ident); + int64_t cols_rounded = n / WIDTH; + reduce128(data, buf, cols_rounded, WIDTH); + scalar_t result = ident; + for (int64_t i = 0; i < WIDTH; i++) { + result = ReduceScalar()(result, buf[i]); + } + for (int64_t col = cols_rounded * WIDTH; col != n; col++) { + result = ReduceScalar()(result, data[col]); + } + out_[b] = result; + } + }); + } else { + int64_t rows = n; + int64_t cols = stride; + int64_t cols_rounded = round_down(cols, WIDTH); + int64_t size = cols_rounded; + parallel_for( + 0, + batch * (size / WIDTH), + 1, + [out_, data_, n, stride, rows, cols, cols_rounded, size]( + int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / (size / WIDTH); + int64_t i = bi % (size / WIDTH); + int64_t k = i * WIDTH; + reduce128( + &data_[b * n * stride + k], + &out_[b * stride + k], + rows, + stride); + } + }); + + _parallel_for(batch, 1, paralellize, [=](int64_t b) { + const scalar_t* data = &data_[b * n * stride]; + scalar_t* out = &out_[b * stride]; + int64_t rows = n; + int64_t cols = stride; + + int64_t cols_rounded = round_down(cols, WIDTH); + if (cols_rounded != cols) { + scalar_t buf[WIDTH] = {0}; + std::fill(buf, buf + WIDTH, ident); + for (int64_t row = 0; row != rows; row++) { + for (int64_t j = 0; j != cols - cols_rounded; j++) { + auto val = data[row * stride + j + cols_rounded]; + buf[j] = ReduceScalar()(buf[j], val); + } + } + for (int64_t j = 0; j != cols - cols_rounded; j++) { + out[j + cols_rounded] = buf[j]; + } + } + }); + } + } + + static scalar_t reduce_all(const scalar_t* data, int64_t size) { + int64_t k = size / WIDTH; + + scalar_t sum = parallel_reduce( + 0, + k, + internal::GRAIN_SIZE / WIDTH, + (scalar_t)ident, + [data](int64_t begin, int64_t end, scalar_t init) { + scalar_t buf[WIDTH]; + reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH); + return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); + }, + ReduceScalar()); + + for (int64_t i = k * WIDTH; i != size; i++) { + sum = ReduceScalar()(sum, data[i]); + } + return sum; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number + // of rows. Stores the results in out[0 ... WIDTH-1]. 
+ static void reduce128(const scalar_t* data, scalar_t* out, int64_t rows, int64_t stride) { + Vec acc[4] = {ident, ident, ident, ident}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + for (int64_t row = 0; row != rows; row++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * stride + j * Vec::size]); + acc[j] = Reduce()(acc[j], val); + } + } + for (int j = 0; j != 4; j++) { + acc[j].store(&out[j * Vec::size]); + } + } +}; + +static void sum_kernel_impl(Tensor& result, const Tensor& self, at::optional dim) { + AT_DISPATCH_ALL_TYPES(self.type(), "sum", [&] { + Reduction::apply(result, self, dim); + }); +} + +static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional dim) { + AT_DISPATCH_ALL_TYPES(self.type(), "prod", [&] { + Reduction::apply(result, self, dim); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); +REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h new file mode 100644 index 0000000..9481b90 --- /dev/null +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include "CapabilityDispatch.h" + +namespace at { +namespace native { + +using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional); + +extern DispatchStub sum_kernel; +extern DispatchStub prod_kernel; + +} +} diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp new file mode 100644 index 0000000..6cfa90f --- /dev/null +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -0,0 +1,268 @@ +#include "ATen/native/cpu/SoftmaxKernel.h" + +#include +#include +#include + +#include "ATen/Dispatch.h" +#include "ATen/Parallel.h" +#include "ATen/cpu/vec256/functional.h" +#include "ATen/cpu/vec256/vec256.h" +#include "ATen/optional.h" + +// [Note AVX-SSE transitions] In general we avoid calls into cmath for code +// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in +// Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 +// +// On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of +// computations per task. Each task works across dim_size elements. 16 should be +// a very rough approximation of the number of computations per dim_size element +// by counting simple computations (*, +, -) as 1 and exp or log as 4. 
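+//
+// Worked example (illustrative numbers only): with GRAIN_SIZE = 32768 and
+// dim_size = 1024, the kernels below would use
+//   grain_size = 32768 / (16 * 1024) = 2 outer rows per task,
+// clamped from below to 1 (the chunked log-softmax variant also divides by
+// CHUNK_SIZE and clamps to CHUNK_SIZE).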
+ +namespace at { namespace native { +namespace { + +template +inline void _vec_log_softmax_lastdim( + scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + if (grain_size < CHUNK_SIZE) + grain_size = CHUNK_SIZE; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { + scalar_t tmp_sum_scalar[CHUNK_SIZE]; + scalar_t max_input_arr[CHUNK_SIZE]; + int64_t loop_end = CHUNK_SIZE; + if (ii + CHUNK_SIZE > end) + loop_end = end - ii; + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + max_input_arr[j] = vec256::reduce_all( + [](Vec& x, Vec& y) { return vec256::max(x, y); }, + input_data, + dim_size); + } + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t max_input = max_input_arr[j]; + tmp_sum_scalar[j] = vec256::map_reduce_all( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + [](Vec x, Vec y) { return x + y; }, + input_data, + dim_size); + } + // See [Note AVX-SSE transitions] for why this should call the + // vectorized version (aside from perf improvements). + vec256::map2( + [](Vec x, Vec y) { return x.log() + y; }, + tmp_sum_scalar, + tmp_sum_scalar, + max_input_arr, + loop_end); + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t tmp_sum = tmp_sum_scalar[j]; + vec256::map( + [tmp_sum](Vec x) { return x - Vec(tmp_sum); }, + output_data, + input_data, + dim_size); + } + } + }); +} + +template +inline void _vec_softmax_lastdim( + scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); + if (grain_size < 1) + grain_size = 1; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t max_input = vec256::reduce_all( + [](Vec& x, Vec& y) { return vec256::max(x, y); }, + input_data, + dim_size); + vec256::map( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + output_data, + input_data, + dim_size); + scalar_t tmp_sum = vec256::reduce_all( + [](Vec x, Vec y) { return x + y; }, output_data, dim_size); + tmp_sum = 1 / tmp_sum; + vec256::map( + [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, + output_data, + output_data, + dim_size); + } + }); +} + +template +inline void _vec_host_softmax_backward_lastdim( + scalar_t* grad_input_data_base, + scalar_t* grad_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); + if (grain_size < 1) + grain_size = 1; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + scalar_t* grad_input_data = grad_input_data_base + i * dim_size; + scalar_t* grad_data = grad_data_base + i * dim_size; + scalar_t* output_data = 
output_data_base + i * dim_size; + scalar_t sum; + if (log_softmax) { + sum = vec256::reduce_all( + [](Vec& x, Vec& y) { return x + y; }, grad_data, dim_size); + } else { + sum = vec256::map2_reduce_all( + [](Vec x, Vec y) { return x * y; }, + [](Vec x, Vec y) { return x + y; }, + grad_data, + output_data, + dim_size); + } + if (log_softmax) { + vec256::map2( + [sum](Vec x, Vec y) { return x - ((y.exp()) * Vec(sum)); }, + grad_input_data, + grad_data, + output_data, + dim_size); + } else { + vec256::map2( + [sum](Vec x, Vec y) { return (x - Vec(sum)) * y; }, + grad_input_data, + grad_data, + output_data, + dim_size); + } + } + }); +} + +template +struct vec_host_softmax_lastdim { + static void apply(Tensor& output, const Tensor& input) { + int64_t outer_size = 1; + int64_t dim_size = input.size(input.ndimension() - 1); + for (int64_t i = 0; i < input.ndimension() - 1; ++i) + outer_size *= input.size(i); + scalar_t* input_data_base = input.data(); + scalar_t* output_data_base = output.data(); + if (LogSoftMax) { + _vec_log_softmax_lastdim( + input_data_base, output_data_base, outer_size, dim_size); + } else { + _vec_softmax_lastdim( + input_data_base, output_data_base, outer_size, dim_size); + } + } +}; + +template +struct vec_host_softmax_backward_lastdim { + static void + apply(Tensor& grad_input, const Tensor& grad, const Tensor& output) { + int64_t outer_size = 1; + int64_t dim_size = grad.size(grad.ndimension() - 1); + for (int64_t i = 0; i < grad.ndimension() - 1; ++i) + outer_size *= grad.size(i); + scalar_t* grad_input_data_base = grad_input.data(); + scalar_t* grad_data_base = grad.data(); + scalar_t* output_data_base = output.data(); + _vec_host_softmax_backward_lastdim( + grad_input_data_base, + grad_data_base, + output_data_base, + outer_size, + dim_size); + } +}; + +static void softmax_lastdim_kernel_impl(Tensor& result, const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "softmax_lastdim_kernel_impl", [&] { + vec_host_softmax_lastdim::apply(result, self); + }); +} + +static void log_softmax_lastdim_kernel_impl( + Tensor& result, + const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES( + self.type(), "log_softmax_lastdim_kernel_impl", [&] { + vec_host_softmax_lastdim::apply(result, self); + }); +} + +static void softmax_backward_lastdim_kernel_impl( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output) { + AT_DISPATCH_FLOATING_TYPES( + grad.type(), "softmax_backward_lastdim_kernel_impl", [&] { + vec_host_softmax_backward_lastdim::apply( + grad_input, grad, output); + }); +} + +static void log_softmax_backward_lastdim_kernel_impl( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output) { + AT_DISPATCH_FLOATING_TYPES( + grad.type(), "log_softmax_backward_lastdim_kernel_impl", [&] { + vec_host_softmax_backward_lastdim::apply( + grad_input, grad, output); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(softmax_lastdim_kernel, &softmax_lastdim_kernel_impl); +REGISTER_DISPATCH(log_softmax_lastdim_kernel, &log_softmax_lastdim_kernel_impl); +REGISTER_DISPATCH( + softmax_backward_lastdim_kernel, + &softmax_backward_lastdim_kernel_impl); +REGISTER_DISPATCH( + log_softmax_backward_lastdim_kernel, + &log_softmax_backward_lastdim_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h new file mode 100644 index 0000000..dbd703b --- /dev/null +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include "CapabilityDispatch.h" 
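+
+// For reference, the backward kernels registered in SoftMaxKernel.cpp compute,
+// with y the forward output and dy the incoming gradient (sums over the last
+// dimension):
+//   softmax:     dx = (dy - sum(dy * y)) * y
+//   log_softmax: dx = dy - exp(y) * sum(dy)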
+ +namespace at { +namespace native { + +using forward_fn = void(*)(Tensor &, const Tensor &); +using backward_fn = void(*)(Tensor &, const Tensor &, const Tensor&); + +extern DispatchStub softmax_lastdim_kernel; +extern DispatchStub log_softmax_lastdim_kernel; +extern DispatchStub softmax_backward_lastdim_kernel; +extern DispatchStub log_softmax_backward_lastdim_kernel; + +} +} diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp new file mode 100644 index 0000000..7416923 --- /dev/null +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -0,0 +1,171 @@ +#include "ATen/native/cpu/UnaryOpsKernel.h" + +#include +#include "ATen/Dispatch.h" +#include "ATen/cpu/vml.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/native/cpu/CapabilityDispatch.h" +#ifdef __AVX2__ +#include "ATen/native/cpu/avx_mathfun.h" +#endif + +namespace at { namespace native { +namespace { + +using namespace vec256; + +template +static int64_t _sigmoid(scalar_t* x, scalar_t* y, int64_t size); + +// This should be a temporary solution until we understand why SLEEF is slower +// for sigmoid + +template <> +int64_t _sigmoid(float* x, float* y, int64_t size) { + using Vec = Vec256; + int64_t i = 0; + for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { + Vec ret = Vec::loadu(y + i); + Vec ret2 = Vec::loadu(y + i + Vec::size); + ret = ret.neg(); + ret2 = ret2.neg(); +#if defined(__AVX2__) && !defined(_MSC_VER) + ret = exp256_ps(ret); + ret2 = exp256_ps(ret2); +#else + ret = ret.exp(); + ret2 = ret2.exp(); +#endif + ret = Vec((float)(1)) + ret; + ret2 = Vec((float)(1)) + ret2; + ret = ret.reciprocal(); + ret2 = ret2.reciprocal(); + ret.store(x + i); + ret2.store(x + i + Vec::size); + } + return i; +} + +template <> +int64_t _sigmoid(double* x, double* y, int64_t size) { + using Vec = Vec256; + int64_t i = 0; + for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { + Vec ret = Vec::loadu(y + i); + Vec ret2 = Vec::loadu(y + i + Vec::size); + ret = ret.neg(); + ret2 = ret2.neg(); + ret = ret.exp(); + ret2 = ret2.exp(); + ret = Vec((double)(1)) + ret; + ret2 = Vec((double)(1)) + ret2; + ret = ret.reciprocal(); + ret2 = ret2.reciprocal(); + ret.store(x + i); + ret2.store(x + i + Vec::size); + } + return i; +} + +static void sigmoid_kernel(Tensor& result, const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "sigmoid", [&] { + using Vec = Vec256; + CPU_tensor_parallel_kernel_apply2( + result, + self, + [](int64_t size, + scalar_t* x, + scalar_t* y, + int64_t stridex, + int64_t stridey) { + int64_t i = 0; + if (stridex == 1 && stridey == 1) { + i = _sigmoid(x, y, size); + } + for (; i < size; i += Vec::size) { + scalar_t buffer[Vec::size]; + int64_t width = Vec::size; + width = std::min(width, size - i); + for (int64_t j = 0; j < width; j++) { + buffer[j] = y[stridey * (i + j)]; + } + Vec ret = Vec::loadu(buffer); + ret = Vec((scalar_t)(0)) - ret; + ret = ret.exp(); + ret = Vec((scalar_t)(1)) + ret; + ret = ret.reciprocal(); + ret.store(buffer); + for (int64_t j = 0; j < width; j++) + x[stridex * (i + j)] = buffer[j]; + } + }); + }); +} + +#define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \ + static void op##_kernel(Tensor& result, const Tensor& self) { \ + AT_DISPATCH_##dispatchtypes##_TYPES(self.type(), #op, [&] { \ + if (self.is_contiguous() && result.is_contiguous()) { \ + vml::v##op( \ + result.data(), self.data(), self.numel()); \ + \ + } else { \ + static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t); \ + 
CPU_tensor_parallel_kernel_apply2( \ + result, \ + self, \ + [](int64_t size, \ + scalar_t* x, \ + scalar_t* y, \ + int64_t stridex, \ + int64_t stridey) { \ + if (stridex == 1 && stridey == 1) { \ + vml::v##op(x, y, size); \ + } else { \ + for (int64_t i = 0; i < size; i += WIDTH) { \ + scalar_t buffer[WIDTH]; \ + int64_t width = WIDTH; \ + width = std::min(width, size - i); \ + for (int64_t j = 0; j < width; j++) \ + buffer[j] = y[stridey * (i + j)]; \ + vml::v##op(buffer, buffer, width); \ + for (int64_t j = 0; j < width; j++) \ + x[stridex * (i + j)] = buffer[j]; \ + } \ + } \ + }); \ + } \ + }); \ + } \ + REGISTER_DISPATCH(op##Impl, &op##_kernel) + +} // anonymous namespace + +REGISTER_DISPATCH(sigmoidImpl, &sigmoid_kernel) + +// IMPLEMENT_FLOAT_KERNEL(ALL, abs) +IMPLEMENT_FLOAT_KERNEL(FLOATING, acos) +IMPLEMENT_FLOAT_KERNEL(FLOATING, asin) +IMPLEMENT_FLOAT_KERNEL(FLOATING, atan) +IMPLEMENT_FLOAT_KERNEL(FLOATING, ceil) +IMPLEMENT_FLOAT_KERNEL(FLOATING, cos) +// IMPLEMENT_FLOAT_KERNEL(FLOATING, cosh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, erf) +IMPLEMENT_FLOAT_KERNEL(FLOATING, erfc) +IMPLEMENT_FLOAT_KERNEL(FLOATING, exp) +IMPLEMENT_FLOAT_KERNEL(FLOATING, expm1) +IMPLEMENT_FLOAT_KERNEL(FLOATING, floor) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log10) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log1p) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log2) +IMPLEMENT_FLOAT_KERNEL(FLOATING, round) +IMPLEMENT_FLOAT_KERNEL(FLOATING, rsqrt) +IMPLEMENT_FLOAT_KERNEL(FLOATING, sin) +// IMPLEMENT_FLOAT_KERNEL(FLOATING, sinh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, sqrt) +IMPLEMENT_FLOAT_KERNEL(FLOATING, tan) +IMPLEMENT_FLOAT_KERNEL(FLOATING, tanh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, trunc) + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h new file mode 100644 index 0000000..d9bffad --- /dev/null +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include "CapabilityDispatch.h" + +namespace at { namespace native { + +using unary_fn = void(*)(Tensor&, const Tensor&); + +extern DispatchStub absImpl; +extern DispatchStub acosImpl; +extern DispatchStub asinImpl; +extern DispatchStub atanImpl; +extern DispatchStub ceilImpl; +extern DispatchStub cosImpl; +// extern DispatchStub coshImpl; +extern DispatchStub erfImpl; +extern DispatchStub erfcImpl; +extern DispatchStub expImpl; +extern DispatchStub expm1Impl; +extern DispatchStub floorImpl; +extern DispatchStub logImpl; +extern DispatchStub log10Impl; +extern DispatchStub log1pImpl; +extern DispatchStub log2Impl; +extern DispatchStub roundImpl; +extern DispatchStub rsqrtImpl; +extern DispatchStub sigmoidImpl; +extern DispatchStub sinImpl; +// extern DispatchStub sinhImpl; +extern DispatchStub sqrtImpl; +extern DispatchStub tanImpl; +extern DispatchStub tanhImpl; +extern DispatchStub truncImpl; + + +// Missing unary functions +// digamma +// lgamma + +// TODO: See below +// erfinv +// fill +// frac +// clone +// contiguous +// clamp/_min/_max +// neg +// reciprocal +// sigmoid +// sign +// zero + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h new file mode 100644 index 0000000..3e40146 --- /dev/null +++ b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -0,0 +1,715 @@ +#pragma once +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio 
+ Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include "Intrinsics.h" + +/* yes I know, the top of this file is quite ugly */ +#if defined(__GNUC__) +# define ALIGN32_BEG __attribute__((aligned(32))) +#elif defined(_WIN32) +# define ALIGN32_BEG __declspec(align(32)) +#endif + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] = { Val, Val, Val, Val } + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } + +_PS256_CONST(1 , 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +#ifndef __AVX2__ + +typedef union imm_xmm_union { + v8si imm; + v4si xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \ + imm_xmm_union u __attribute__((aligned(32))); \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ +} + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ + 
imm_xmm_union u __attribute__((aligned(32))); \ + u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ + } + + +#define AVX2_BITOP_USING_SSE2(fn) \ +static inline v8si _mm256_##fn(v8si x, int a) \ +{ \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1,a); \ + x2 = _mm_##fn(x2,a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return(ret); \ +} + +#warning "Using SSE2 to perform AVX2 bitshift ops" +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) + +#define AVX2_INTOP_USING_SSE2(fn) \ +static inline v8si _mm256_##fn(v8si x, v8si y) \ +{ \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1,y1); \ + x2 = _mm_##fn(x2,y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return(ret); \ +} + +#warning "Using SSE2 to perform AVX2 integer ops" +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) + +#endif /* __AVX2__ */ + + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +inline v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + + tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, 
-88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm256_cvttps_epi32(fx); + //tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + //v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = _mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+ +*/ +inline v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + +#ifndef __AVX2__ + v4si imm0_1, imm0_2; + v4si imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); + imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = _mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +void hardshrink_cuda_kernel(const Tensor& self, Tensor& out_tensor, scalar_t* lambd) { + at::cuda::CUDA_tensor_apply2( + self, + out_tensor, + [lambd] __device__ ( + scalar_t& self_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd && self_val <= *lambd) ? scalar_t(0) : self_val; + }); +} + +template +void hardshrink_backward_cuda_kernel(Tensor& out_tensor, scalar_t* lambd, const Tensor& self, const Tensor& grad) { + at::cuda::CUDA_tensor_apply3( + self, + grad, + out_tensor, + [lambd] __device__ ( + scalar_t& self_val, + scalar_t& grad_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd && self_val <= *lambd) ? scalar_t(0) : grad_val; + }); +} + +Tensor hardshrink_cuda(const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(self); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_cuda", [&] { + hardshrink_cuda_kernel(self, out_tensor, lambd_tensor.data()); + }); + return out_tensor; +} + +Tensor hardshrink_backward_cuda(const Tensor & grad, const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? 
Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(grad); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_backward_cuda", [&] { + hardshrink_backward_cuda_kernel(out_tensor, lambd_tensor.data(), self, grad); + }); + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/CUDAReduceOps.cpp b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp new file mode 100644 index 0000000..a3b32ce --- /dev/null +++ b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp @@ -0,0 +1,29 @@ +#include +#include "ATen/native/ReduceOpsUtils.h" + +namespace at { namespace native { + +Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); } + +Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); } + +Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim) { + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { + return result; + } else { + return at::_th_sum_out(result, self, dim, keepdim); + } +} + +Tensor &_prod_out_cuda(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim) { + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { + return result; + } else { + return at::_th_prod_out(result, self, dim, keepdim); + } +} + + +}} diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp new file mode 100644 index 0000000..2e524f4 --- /dev/null +++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp @@ -0,0 +1,42 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +// These are just forwarding stubs + +#define IMPLEMENT_UNARY_OP_PREQUEL(op) \ + Tensor& _##op##__cuda(Tensor& self) { \ + return at::_##op##_out(self, self); \ + } \ + Tensor& _##op##_out_cuda(Tensor& result, const Tensor& self) { \ + return at::_##op##_out(result, self); \ + } + + +IMPLEMENT_UNARY_OP_PREQUEL(abs) +IMPLEMENT_UNARY_OP_PREQUEL(acos) +IMPLEMENT_UNARY_OP_PREQUEL(asin) +IMPLEMENT_UNARY_OP_PREQUEL(atan) +IMPLEMENT_UNARY_OP_PREQUEL(ceil) +IMPLEMENT_UNARY_OP_PREQUEL(cos) +IMPLEMENT_UNARY_OP_PREQUEL(cosh) +IMPLEMENT_UNARY_OP_PREQUEL(erf) +IMPLEMENT_UNARY_OP_PREQUEL(erfc) +IMPLEMENT_UNARY_OP_PREQUEL(exp) +IMPLEMENT_UNARY_OP_PREQUEL(expm1) +IMPLEMENT_UNARY_OP_PREQUEL(floor) +IMPLEMENT_UNARY_OP_PREQUEL(log) +IMPLEMENT_UNARY_OP_PREQUEL(log10) +IMPLEMENT_UNARY_OP_PREQUEL(log1p) +IMPLEMENT_UNARY_OP_PREQUEL(log2) +IMPLEMENT_UNARY_OP_PREQUEL(round) +IMPLEMENT_UNARY_OP_PREQUEL(rsqrt) +IMPLEMENT_UNARY_OP_PREQUEL(sigmoid) +IMPLEMENT_UNARY_OP_PREQUEL(sin) +IMPLEMENT_UNARY_OP_PREQUEL(sinh) +IMPLEMENT_UNARY_OP_PREQUEL(sqrt) +IMPLEMENT_UNARY_OP_PREQUEL(tan) +IMPLEMENT_UNARY_OP_PREQUEL(tanh) +IMPLEMENT_UNARY_OP_PREQUEL(trunc) + +}} diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000..49c56cb --- /dev/null +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,399 @@ +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/native/cuda/CuFFTUtils.h" +#include "ATen/native/utils/ParamsHash.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { namespace detail { + +// This POD struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. 
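+// (Presumably the hash and equality helpers from ATen/native/utils/ParamsHash.h
+// operate on the raw bytes of this struct, which is why setCuFFTParams below
+// memsets it to zero first: padding bytes must be deterministic for bytewise
+// hashing and comparison to be meaningful.)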
+struct CuFFTParams +{ + at::ScalarType scalar_type_; + int64_t input_sizes_[max_rank + 2]; + int64_t input_strides_[max_rank + 2]; + uint8_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + bool complex_input_; + bool complex_output_; + int64_t signal_sizes_[max_rank]; + bool onesided_; +}; + +// NB: This can't be a constructor, because then CuFFTParams +// would not be a POD anymore. +static inline void setCuFFTParams(CuFFTParams* params, + const Tensor& input, int64_t signal_ndim, bool complex_input, + bool complex_output, IntList checked_signal_sizes, bool onesided) { + + memset(params, 0, sizeof(CuFFTParams)); + params->scalar_type_ = input.type().scalarType(); + for (int i = 0; i != input.dim(); ++i) { + params->input_sizes_[i] = input.size(i); + if (input.size(i) != 1) { + params->input_strides_[i] = input.stride(i); + } + } + params->signal_ndim_ = (uint8_t) signal_ndim; + params->complex_input_ = complex_input; + params->complex_output_ = complex_output; + for (size_t i = 0; i != checked_signal_sizes.size(); ++i) { + params->signal_sizes_[i] = checked_signal_sizes[i]; + } + params->onesided_ = onesided; +} + +struct CuFFTHandleDeleter { + void operator()(cufftHandle* x) { + if (x != nullptr) { + CUFFT_CHECK(cufftDestroy(*x)); + } + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// Its constructor also guarantees that if `input` is contiguous in all +// dimensions, e.g., from cloning, clone_input will be false. +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(Tensor& input, int64_t signal_ndim, bool complex_input, + bool complex_output, IntList checked_signal_sizes, bool onesided, + IntList output_sizes) { + + // signal sizes + std::vector signal_sizes(checked_signal_sizes.begin(), + checked_signal_sizes.end()); + + // input batch size + long long int batch = input.size(0); + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + clone_input = false; + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. 
+ if (input.type().scalarType() == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::globalContext().getCurrentDeviceProperties(); + if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { + std::ostringstream ss; + ss << "cuFFT doesn't support signals of half type with compute " + << "capability less than SM_53, but the device containing input half " + << "tensor only has SM_" << dev_prop->major << dev_prop->minor; + throw std::runtime_error(ss.str()); + } + for (int64_t i = 0; i < signal_ndim; i++) { + auto signal_size = checked_signal_sizes[i]; + if (!is_pow_of_two(signal_size)) { + std::ostringstream ss; + ss << "cuFFT doesn't support signals of half type with size at any " + << "dimension that is not a power of two, but got a signal size of " + << checked_signal_sizes; + throw std::runtime_error(ss.str()); + } + } + clone_input |= input.stride(signal_ndim) != 1; + } + + // check the input sizes and strides to see if we need to make it contiguous + // cuFFT doesn't support batch dim with stride 0 + clone_input |= input.stride(0) == 0; + + if (complex_input) { + // Real/imag dimension must be like complex type. + clone_input |= input.stride(-1) != 1; + // Strides of other dimensions needs to be aligned when viewed as of complex + // type, i.e., multiples of 2. We check the batch dim and last signal dim + // here. If the input can be viewed as having embedded strides, the other + // signal dims will also satisfy this. + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + clone_input |= (batch > 0 && input.stride(0) % 2 != 0) || + input.stride(signal_ndim) % 2 != 0; + } + + // Checks if input strides can be viewed as embedded. + // See NOTE [ cuFFT Embedded Strides ]. + // + // TODO: Figure out why windows fails to compile + // at::optional> inembed_opt = at::nullopt; + // Then move the following to a helper function. + std::vector inembed(signal_ndim); + if (!clone_input) { + auto istrides = input.strides(); + auto last_istride = istrides[signal_ndim]; + clone_input = last_istride <= 0; + for (auto i = signal_ndim - 1; !clone_input && i > 0 /* inembed[0] doesn't matteer */; i--) { + auto istride = istrides[i]; + if (istride > 0 && istride % last_istride == 0) { + inembed[i] = istride / last_istride; + last_istride = istride; + } else { + clone_input = true; + } + } + } + + // Check if we can take advantage of simple data layout. + // + // Note that this is before the actual cloning. This is intentional so we can + // check for advanced data layout with complex-to-real transform. cuFFT + // out-of-place complex-to-real transforms with advanced layout may overwrite + // input, and we need to clone the input. + // + // This just needs contiguity in cases except for twosided real-to-complex + // transform where we won't have simple data layout as output is two sided. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + bool simple_layout = !(!complex_input && complex_output && !onesided) && // not twosided R2C + (clone_input || input.is_contiguous()); // contiguous + if (!simple_layout && complex_input && !complex_output) { + clone_input = true; + simple_layout = true; + } + + // if input should be cloned but simple layout can't be used (e.g. 
twosided R2C) + if (clone_input && !simple_layout) { + auto input_size = input.sizes(); + std::copy(input_size.begin() + 1, // begin of signal dim in input + input_size.begin() + signal_ndim + 1, // end of signal dim in input + inembed.begin()); // begin of output + } + + cudaDataType itype, otype, exec_type; + if (input.type().scalarType() == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (input.type().scalarType() == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (input.type().scalarType() == ScalarType::Half) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + std::ostringstream ss; + ss << "cuFFT doesn't support tensor of type: " + << at::toString(input.type().scalarType()); + throw std::runtime_error(ss.str()); + } + + // create plan + auto raw_plan_ptr = new cufftHandle(); + CUFFT_CHECK(cufftCreate(raw_plan_ptr)); + plan_ptr.reset(raw_plan_ptr); + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores base_istride, base_ostride, idist, and odist + // by assuming base_istride = base_ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + // set idist (stride at batch dim) + // set base_istride (stride at innermost dim of signal) + long long int idist, base_istride; + if (clone_input) { + idist = at::prod_intlist(input.sizes().slice(1, signal_ndim)); + base_istride = 1; + } else if (complex_input) { + idist = input.stride(0) >> 1; + base_istride = input.stride(signal_ndim) >> 1; + } else { + idist = input.stride(0); + base_istride = input.stride(signal_ndim); + } + // Even if batch dimension is one and idist (stride(0)) doesn't matter, + // cuFFT errors if idist = 0. This is hack to make it succeed. + if (idist == 0 && batch == 1) { + idist = 1; + } + + // set odist, onembed, base_ostride + long long int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim)); + std::vector onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1); + long long int base_ostride = 1; + + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + inembed.data(), base_istride, idist, itype, + onembed.data(), base_ostride, odist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return *plan_ptr.get(); } + + bool should_clone_input() const { return clone_input; } + + int64_t workspace_size() const { return ws_size; } + +private: + std::unique_ptr plan_ptr; + bool clone_input; + int64_t ws_size; +}; + +// NB: cuFFT allocates a starting plan array of size 1024. It should grow the +// array as more plans are created. 
However, a bug in cuFFT (at least +// present in CUDA 9.1) causes the cufftSetAutoAllocation call on the +// 1024-th plan to fail with CUFFT_INVALID_PLAN. Therefore, we check that +// cache size is leq 1023. The initial plan array size is 1024 for +// CUDA 8.0 ~ 9.2 so setting this as a CUDA-version-agnostic constant should +// be fine for now. +// TODO: When CUDA 10 comes out, check if the bug is fixed or if we need another +// number for CUDA 10. +constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_MAX_PLAN_NUM) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache using value_args and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. + // This is similar to c++ 17 try_emplace. + template + const CuFFTConfig &try_emplace_value(K&& key, VArgs&&... value_args) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(key); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(key), + std::forward_as_tuple(value_args...)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + +private: + // Only sets size and does value check. Does not resize the data structures. 
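+  // Hypothetical caller-side sketch (names assumed, not part of this header):
+  // the thread-safety contract above means lookups are guarded externally,
+  // roughly
+  //
+  //   static std::mutex plan_mutex;              // owned by the caller
+  //   static CuFFTParamsLRUCache plan_cache;
+  //
+  //   std::lock_guard<std::mutex> guard(plan_mutex);
+  //   const CuFFTConfig& cfg = plan_cache.try_emplace_value(
+  //       std::move(params), input, signal_ndim, complex_input,
+  //       complex_output, checked_signal_sizes, onesided, output_sizes);
+  //
+  // The helper below only validates and records the requested size.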
+ void _set_max_size(int64_t new_size) { + AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + AT_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(); +void cufft_set_plan_cache_max_size_impl(int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(); +void cufft_clear_plan_cache_impl(); + +}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h new file mode 100644 index 0000000..5edfcbc --- /dev/null +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -0,0 +1,72 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Config.h" + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// This means that max dim is 3 + 2 = 5 with batch dimension and possible +// complex dimension +constexpr int max_rank = 3; + +static inline std::string _cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + default: + std::ostringstream ss; + ss << "unknown error " << error; + return ss.str(); + } +} + +static inline void CUFFT_CHECK(cufftResult error) +{ + if (error != CUFFT_SUCCESS) { + std::ostringstream ss; + ss << "cuFFT error: " << _cudaGetErrorEnum(error); + AT_ERROR(ss.str()); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu new file mode 100644 index 0000000..4b346ca --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -0,0 +1,122 @@ +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include +#include +#include + +#include "ATen/native/Distributions.h" + +#include +#include +#include +#include + +#include +#include +#include + +THCGenerator* 
THCRandom_getGenerator(THCState* state); + +namespace { +std::pair next_philox_seed(at::Generator* gen, uint64_t increment) { + auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); + uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment); + return std::make_pair(gen_->state.initial_seed, offset); +} + +template +void poisson_cuda_kernel( + at::Tensor& ret, + const at::Tensor& lambda, + std::pair seeds) { + at::cuda::CUDA_tensor_apply2( + ret, + lambda, + [seeds] __device__( + scalar_t & ret_val, const scalar_t& lambda) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + ret_val = static_cast(curand_poisson(&state, lambda)); + }); +} + +template +void gamma_cuda_kernel( + at::Tensor& ret, + const at::Tensor& alpha, + std::pair seeds) { + using accscalar_t = at::acc_type; + at::cuda::CUDA_tensor_apply2( + ret, + alpha, + [seeds] __device__( + scalar_t & ret_val, const scalar_t& alpha) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + BaseSampler standard_uniform([&state] __device__ () { + return curand_uniform(&state); + }); + BaseSampler standard_normal([&state] __device__ () { + return curand_normal(&state); + }); + auto sample = sample_gamma(alpha, standard_uniform, standard_normal); + auto min_value = std::numeric_limits::lowest(); + ret_val = (min_value > sample) ? min_value : sample; + }); +} + +template +void gamma_grad_cuda_kernel( + at::Tensor& ret, + const at::Tensor& self, + const at::Tensor& output) { + using accscalar_t = at::acc_type; + at::cuda::CUDA_tensor_apply3( + ret, self, output, + [] __device__ (scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { + ret_val = standard_gamma_grad_one(self_val, output_val); + }); +} + +} // namespace + +namespace at { namespace native { +Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { + Tensor ret = lambda.type().tensor(lambda.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "poisson", [&] { + poisson_cuda_kernel(ret, lambda, next_philox_seed(gen, 20)); + }); + return ret; +} + +Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { + Tensor ret = alpha.type().tensor(alpha.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "gamma", [&] { + gamma_cuda_kernel(ret, alpha, next_philox_seed(gen, 10)); + }); + return ret; +} + +Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "_standard_gamma_grad", [&] { + gamma_grad_cuda_kernel(ret, self, output); + }); + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu new file mode 100644 index 0000000..affe20d --- /dev/null +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -0,0 +1,371 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/Error.h" + +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include + +#include +#include + + +namespace at { namespace native { + +namespace { + +static const int WARP_SIZE = 32; +static const int BLOCKDIMY = 32; + +template + +__global__ void embedding_backward_feature_kernel + (int64_t* indices, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ grad_weight, + int n, // OK to pass as int, we don't expect 2 billion+ samples in one shot + int64_t 
stride, + int padding_idx) +{ + extern __shared__ char buf[]; + accscalar_t* smem = (accscalar_t*)buf; + accscalar_t* my_s = smem + WARP_SIZE*threadIdx.y; + int* indices_batch = (int*)(buf + sizeof(accscalar_t)*WARP_SIZE*blockDim.y); + + const int s = (int)stride; // OK to make int, we don't expect 2 billion+ embedding row size + + const int f = threadIdx.x + blockIdx.x*blockDim.x; // feature_dim + + for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y) + { + // Entire block cooperates to load a batch of 1024 indices to process + int tid = threadIdx.x + threadIdx.y*blockDim.x; + if(batch_start + tid < n) + indices_batch[tid] = (int)indices[batch_start + tid]; + + // Loop over the batch of <= 1024 loaded indices in chunks of blockDim.y = 32 + for(int chunk_start = batch_start; chunk_start < n; chunk_start += blockDim.y) + { + // This does double duty: it makes sure indices_batch is ready, and it makes sure match-group + // leaders are done with their accumulates before other warps start loading again. + __syncthreads(); + + int n_this_chunk = (n - chunk_start) < blockDim.y ? (n - chunk_start) : blockDim.y; + + int src_row = chunk_start + threadIdx.y; + int dst_row = indices_batch[src_row - batch_start]; // This warp's target row in grad_weight + + // All warps load their smem segments with incoming grad data + if(src_row < n && f < s && dst_row != padding_idx) + my_s[threadIdx.x] = static_cast(grad[src_row*stride + f]); + + __syncthreads(); + + // To ensure determinism, we can't just have each warp add its grad data to its dst_row. + // We need to check if any other warps pulled grad data targeting dst_row. + // If so, we elect the first warp in each matching group as the leader. + // Each leader warp serializes the accumulates targeting dst_row in shared memory, + // then finishes by adding the accumulated buffer to dst_row in grad_weight. + if(dst_row != padding_idx && src_row < n) // Per-warp exit condition, safe with ballot_sync + { + int match_found_this_thread = + (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]); + if(threadIdx.x >= n_this_chunk) + match_found_this_thread = 0; + unsigned int matchmask = WARP_BALLOT(match_found_this_thread); + + int first_remaining_peer = __ffs(matchmask) - 1; + + if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader + { + matchmask ^= (1 << first_remaining_peer); + while(matchmask) + { + first_remaining_peer = __ffs(matchmask) - 1; + my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer]; + matchmask ^= (1 << first_remaining_peer); + } + if(f < s) + grad_weight[dst_row*stride + f] += static_cast(my_s[threadIdx.x]); + } + } + } + } +} + + +template +__global__ void embedding_backward_kernel( + int64_t* input, int64_t* indices, scalar_t* grad_output, scalar_t* grad_weight, + int64_t* count, int64_t numel, int64_t stride, int padding_idx) { + + using accscalar_t = acc_type; + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. 
+ // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1]) + && input[idx] != padding_idx) { + do { + const int start_feature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weight_row = ((int) input[idx]) * stride; + const int grad_row = ((int) indices[idx]) * stride; + const accscalar_t scale = count ? (accscalar_t)1.0 / count[idx] : 1.0; + + accscalar_t gradient[SZ]; + accscalar_t weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int feature_dim = start_feature + ii * WARP_SIZE; + if (feature_dim < stride) { + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int feature_dim = start_feature + ii * WARP_SIZE; + if (feature_dim < stride) { + grad_weight[weight_row + feature_dim] = static_cast(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + +/* Calculate norms of the rows of weight_ptr given by idx_ptr and capture them in norms */ +template +__global__ void renorm_kernel( + scalar_t* weights, int64_t* indices, accscalar_t max_norm, + accscalar_t norm_type, int64_t dim, + int64_t weights_stride0, int64_t weights_stride1) { + + // Some casting hacks since dynamic shared memory and templates don't work together: + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + + int tid = threadIdx.x; + int base_index = indices[blockIdx.x] * weights_stride0; + + accscalar_t v = 0; + for (int i = tid; i < dim; i += blockDim.x) { + auto x = static_cast(weights[base_index + i * weights_stride1]); + if (norm_type == 1) { + v += std::abs(x); + } else if (norm_type == 2) { + v += x * x; + } else { + v += std::pow(x, norm_type); + } + } + + using Op = ReduceAdd; + v = reduceBlock(sdata, blockDim.x, v, Op(), 0); + + if (tid == 0) { + sdata[0] = std::pow(v, static_cast(1.0 / norm_type)); + } + __syncthreads(); + + // now we renormalize the blocks that need it + if (sdata[0] > max_norm) { + auto factor = static_cast(max_norm / (sdata[0] + 1e-7)); + for (int i = tid; i < dim; i += blockDim.x) { + weights[base_index + i * weights_stride1] *= factor; + } + } +} + +} // anonymous namespace + +Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices, + int64_t num_weights, int64_t padding_idx, + bool scale_grad_by_freq) { + auto grad_arg = TensorArg(grad_, "grad", 1); + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_backward", indices_arg, kLong); + checkSameGPU("embedding_backward", grad_arg, indices_arg); + + auto num_indices = indices.numel(); + auto grad = grad_.contiguous().view({num_indices, grad_.size(-1)}); + auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); + + int64_t stride = grad_weight.stride(0); + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + if (num_indices <= 768 && !scale_grad_by_freq) { + auto indices_contig = indices.contiguous(); + + dim3 grid(THCCeilDiv(stride, (int64_t)WARP_SIZE)); + dim3 block(WARP_SIZE, BLOCKDIMY); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (grad.type(), + "embedding_backward", + [&] + { + using accscalar_t = acc_type; + embedding_backward_feature_kernel + 
<<>> + (indices_contig.data(), + grad.data(), + grad_weight.data(), + num_indices, + stride, + padding_idx); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; + } + + auto sorted_indices = at::empty_like(indices); + auto orig_indices = at::empty_like(indices); + using device_ptr = thrust::device_ptr; + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + sorted_indices.copy_(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Fill sortedOrigIndices with sequential indices + auto count_iter = thrust::counting_iterator(0); + auto orig_data = device_ptr(orig_indices.data()); + thrust::copy(policy, count_iter, count_iter + num_indices, orig_data); + + // Sort; a stable sort is not required + auto sorted_data = device_ptr(sorted_indices.data()); + thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, + ThrustLTOp()); + } + + Tensor count; + if (scale_grad_by_freq) { + count = at::empty_like(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data()); + auto count_data = device_ptr(count.data()); + thrust::inclusive_scan_by_key( + policy, + sorted_data, + sorted_data + num_indices, + thrust::make_constant_iterator(1), + count_data + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, + thrust::make_reverse_iterator(sorted_data + num_indices), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(num_indices, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "embedding_backward", [&] { + embedding_backward_kernel<<>>( + sorted_indices.data(), + orig_indices.data(), + grad.data(), + grad_weight.data(), + count.defined() ? count.data() : nullptr, + num_indices, + stride, + padding_idx); + }); + THCudaCheck(cudaGetLastError()); + + return grad_weight; +} + +Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, + double max_norm, double norm_type) { + auto self_arg = TensorArg(self, "self", 1); + auto indices_arg = TensorArg(indices, "indices", 1); + checkDim("embedding_renorm_", self_arg, 2); + checkSameGPU("embedding_renorm", self_arg, indices_arg); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + using device_ptr = thrust::device_ptr; + + auto num_indices = indices.numel(); + auto indices_contig = indices.contiguous(); + auto indices_data = device_ptr(indices_contig.data()); + + // FIXME: thrust::unique only removes consecutive elements that are equal. 
+ // We have race conditions when indices contain duplicates which are not + // adjacent + auto unique_indices = indices.type().tensor(indices.numel()); + auto unique_data = device_ptr(unique_indices.data()); + auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); + auto num_unique_indices = static_cast(end - unique_data); + + dim3 grid(num_unique_indices); + dim3 block(128); + int dim = self.stride(0); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "embedding_backward", [&] { + using accscalar_t = acc_type; + renorm_kernel<<>>( + self.data(), + unique_indices.data(), + static_cast(max_norm), + static_cast(norm_type), + dim, self.stride(0), self.stride(1)); + }); + THCudaCheck(cudaGetLastError()); + + return self; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu new file mode 100644 index 0000000..9169cb0 --- /dev/null +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -0,0 +1,389 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include +#include + +#include +#include + +const int WARP_SIZE = 32; +const int MODE_SUM = 0; +const int MODE_MEAN = 1; +const int MODE_MAX = 2; + +namespace at { +namespace native { + +namespace { + +// This kernel assumes that all input tensors except `weight` are contiguous. +template +__global__ void EmbeddingBag_updateOutputKernel( + int64_t *input, int64_t *offsets, scalar_t *weight, scalar_t *output, + int64_t *offset2bag, int64_t numIndices, int64_t numBags, + int64_t featureSize, int64_t weight_stide0, int64_t weight_stride1, + int mode, int64_t *bag_size, int64_t *max_indices) { + + // the strategy here is that each bag x feature is handled by a single thread + + using accscalar_t = acc_type; + int64_t chunksPerBag = THCCeilDiv(featureSize, (int64_t)blockDim.x); + int64_t numChunks = numBags * chunksPerBag; + int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < featureSize) { + int64_t bag = chunk / chunksPerBag; + scalar_t *weightFeat = weight + featureDim * weight_stride1; + int64_t begin = offsets[bag]; + int64_t end = (bag < numBags - 1) ? 
(offsets[bag + 1]) : numIndices; + assert(end >= begin); + + accscalar_t weightFeatSum = 0; + scalar_t weightFeatMax; + + int64_t bag_size_ = 0; + int64_t maxWord = -1; + for (int64_t emb = begin; emb < end; emb++) { + const int64_t weightRow = input[emb] * weight_stide0; + scalar_t weightValue = weightFeat[weightRow]; + + if (mode == MODE_MAX) { + if (emb == begin || weightValue > weightFeatMax) { + weightFeatMax = weightValue; + maxWord = input[emb]; + } + } else { + weightFeatSum += static_cast(weightValue); + } + + bag_size_++; + if (featureDim == 0) { + offset2bag[emb] = bag; + } + } + if (mode == MODE_MEAN) { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } + + if (mode == MODE_MEAN || mode == MODE_SUM) { + output[bag * featureSize + featureDim] = static_cast(weightFeatSum); + } + else if (mode == MODE_MAX) { + max_indices[bag * featureSize + featureDim] = maxWord; + output[bag * featureSize + featureDim] = weightFeatMax; + } + } + } +} + +// FIXME: removed the accGradParametersKernelByFeature case present in +// LookupTable. That kernel is faster at small sizes (<768 indices), which +// does not need EmbeddingBag (LookupTable + Sum works fine), but would +// still be nice to not be slow in that case. + +// This kernel assumes that all input tensors are contiguous. +template +__global__ void EmbeddingBag_accGradParametersKernel_sum_avg( + int64_t *input, int64_t *indices, scalar_t *gradOutput, + scalar_t *gradWeight, int64_t *offset2bag, int64_t *count, ptrdiff_t numel, + int64_t stride, int mode, int64_t *bag_size) { + + using accscalar_t = acc_type; + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel && (idx == 0 || input[idx] != input[idx - 1])) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int)input[idx]) * stride; + + // Note: only this line changes from LookupTable_accgradParametersKernel + const int origRow = ((int)indices[idx]); + const int seq_number = offset2bag[origRow]; + const int gradOutputRow = ((int)seq_number) * stride; + + const accscalar_t scale = count ? 
(accscalar_t)1.0 / count[idx] : 1.0; + + accscalar_t gradient[SZ]; + accscalar_t weight[SZ]; + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) { + gradient[ii] = + static_cast(gradOutput[gradOutputRow + featureDim]); + if (mode == MODE_MEAN) { + gradient[ii] /= bag_size[seq_number]; + } + weight[ii] = + static_cast(gradWeight[weightRow + featureDim]); + } + } + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + weight[ii] += gradient[ii] * scale; + } + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) { + gradWeight[weightRow + featureDim] = + static_cast(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + + +Tensor embedding_bag_backward_cuda_sum_avg( + const Tensor &grad, + const Tensor &indices, + const Tensor &offset2bag, + const Tensor &bag_size_, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + + Tensor &bag_size = const_cast(bag_size_); + + auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + ptrdiff_t numel = indices.numel(); + int64_t stride = grad_weight.stride(0); + + auto sorted_indices = indices.type().tensor(indices.sizes()); + auto orig_indices = indices.type().tensor(indices.sizes()); + using device_ptr = thrust::device_ptr; + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + sorted_indices.copy_(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Fill sortedOrigIndices with sequential indices + auto count_iter = thrust::counting_iterator(0); + auto orig_data = device_ptr(orig_indices.data()); + thrust::copy(policy, count_iter, count_iter + numel, orig_data); + + // Sort; a stable sort is not required + auto sorted_data = device_ptr(sorted_indices.data()); + thrust::sort_by_key(policy, sorted_data, sorted_data + numel, orig_data, + ThrustLTOp()); + } + + Tensor count; + if (scale_grad_by_freq) { + count = indices.type().tensor(indices.sizes()); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data()); + auto count_data = device_ptr(count.data()); + thrust::inclusive_scan_by_key(policy, sorted_data, sorted_data + numel, + thrust::make_constant_iterator(1), + count_data); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, thrust::make_reverse_iterator(sorted_data + numel), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + numel), + thrust::make_reverse_iterator(count_data + numel), + thrust::equal_to(), thrust::maximum()); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t)4), THCCeilDiv(stride, (int64_t)128)); + dim3 block(32, 4); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.type(), "embedding_bag_backward_cuda_sum_avg_kernel", [&] { + EmbeddingBag_accGradParametersKernel_sum_avg< + scalar_t><<>>( + sorted_indices.data(), orig_indices.data(), + 
grad.data(), grad_weight.data(), + offset2bag.data(), + count.defined() ? count.data() : nullptr, numel, stride, + mode, bag_size.data()); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; +} + +template +__global__ void EmbeddingBag_accGradParametersKernel_max( + int64_t *max_indices, scalar_t *gradOutput, + scalar_t *gradWeight, int64_t stride, int64_t numBags) { + + using accscalar_t = acc_type; + + int64_t chunksPerBag = THCCeilDiv(stride, (int64_t)blockDim.x); + int64_t numChunks = numBags * chunksPerBag; + int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < stride) { + int64_t bag = chunk / chunksPerBag; + + int64_t word_idx = max_indices[bag * stride + featureDim]; + + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } + } +} + +Tensor embedding_bag_backward_cuda_max(const Tensor &grad, + const Tensor &max_indices, + int64_t num_weights) { + + auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); + + int64_t stride = grad_weight.stride(0); + + int64_t numBags = grad.size(0); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + dim3 block = dim3(32, 8); + int grid = 1024; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.type(), "embedding_bag_backward_cuda_max", [&] { + EmbeddingBag_accGradParametersKernel_max< + scalar_t><<>>( + max_indices.data(), grad.data(), + grad_weight.data(), stride, numBags); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; +} +} + +// Assumes all input tensors are contiguous. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +_embedding_bag_cuda(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag_cuda", indices_arg, kLong); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag_cuda", offsets_arg, kLong); + auto weight_arg = TensorArg(weight, "weight", 1); + checkSameGPU("embedding_bag_cuda", weight_arg, indices_arg); + checkSameGPU("embedding_bag_cuda", weight_arg, offsets_arg); + + int64_t numIndices = indices.size(0); + int64_t numBags = offsets.size(0); + int64_t featureSize = weight.size(1); + + auto bag_size = at::zeros(offsets.sizes(), indices.options()); + auto offset2bag = + at::zeros({indices.size(0)}, indices.options()); // offset2bag = [0 0 0 0 0] + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); + + Tensor max_indices; + + if (mode == MODE_MAX) { + max_indices = at::zeros({offsets.size(0), weight.size(1)}, indices.options()); + } else { + // No need to allocate if we aren't doing a backwards pass + max_indices = at::zeros({0}, indices.options()); + } + + dim3 block = dim3(32, 8); + int grid = 1024; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(weight.type(), "embedding_bag_cuda", [&] { + EmbeddingBag_updateOutputKernel<<>>( + indices.data(), offsets.data(), + weight.data(), output.data(), + offset2bag.data(), numIndices, numBags, featureSize, + weight.stride(0), weight.stride(1), mode, bag_size.data(), + mode == MODE_MAX ? 
max_indices.data() : NULL); + }); + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, offset2bag, bag_size, max_indices); +} + +Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &indices, + const Tensor &offsets, + const Tensor &offset2bag, + const Tensor &bag_size_, + const Tensor &max_indices, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices, offsets and offset2bag are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward in + // EmbeddingBag.cpp. + // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. + + Tensor grad = grad_.contiguous(); + auto indices_arg = TensorArg(indices, "indices", 1); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + auto grad_arg = TensorArg(grad, "grad", 1); + checkSameGPU("embedding_bag_cuda", grad_arg, offsets_arg); + checkSameGPU("embedding_bag_cuda", grad_arg, indices_arg); + + switch (mode) { + case MODE_SUM: + case MODE_MEAN: + return embedding_bag_backward_cuda_sum_avg(grad, indices, offset2bag, bag_size_, num_weights, scale_grad_by_freq, mode); + + case MODE_MAX: + return embedding_bag_backward_cuda_max(grad, max_indices, num_weights); + + default: + AT_ERROR( + "Unknown mode for embedding_bag_backward_cuda %d", mode); + } +} + +} +} diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu new file mode 100644 index 0000000..c31d557 --- /dev/null +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -0,0 +1,142 @@ +#include "ATen/Context.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/cuda/PinnedMemoryAllocator.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/native/Gesv.h" + +#include "THC.h" // for USE_MAGMA + +#ifdef USE_MAGMA +#include +#include +#endif + +namespace at { +namespace native { + +#ifdef USE_MAGMA +template +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, scalar_t** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, scalar_t** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + AT_ERROR("gesv only takes float or double Tensors"); +} + +template<> +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, float** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, float** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + magma_sgesv_batched( + n, nrhs, dA_array, ldda, dipiv_array, + dB_array, lddb, dinfo_array, batch_count, queue); +} + +template<> +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, double** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, double** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + magma_dgesv_batched( + n, nrhs, dA_array, ldda, dipiv_array, + dB_array, lddb, dinfo_array, batch_count, queue); +} + +static magma_queue_t createMagmaQueue(const Tensor& tensor) { + auto& context = tensor.type().get_context(); + magma_queue_t magma_queue; + magma_queue_create_from_cuda( + tensor.get_device(), + context.getCurrentCUDAStream(), + THCState_getCurrentBlasHandle(context.getTHCState()), + THCState_getCurrentSparseHandle(context.getTHCState()), + &magma_queue); + return magma_queue; +} + +static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + if (static_cast(result) != value) { + 
AT_ERROR("magma: The value of %s (%lld) is too large to fit into a magma_int_t (%llu bytes)", + varname, (long long)value, sizeof(magma_int_t)); + } + return result; +} +#endif + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline std::unique_ptr pin_memory(int64_t size, Tensor dummy) { + int64_t adjusted_size = size * sizeof(T); + auto* allocator = cuda::getPinnedMemoryAllocator(); + auto& backend = dummy.type().toBackend(kCPU).toScalarType(kByte); + return backend.storageWithAllocator(adjusted_size, allocator); +} + +#define ALLOCATE_ARRAY(name, type, size, dummy_tensor) \ + auto storage_##name = pin_memory(size, dummy_tensor); \ + name = reinterpret_cast(storage_##name->data()); + +template +static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +#ifndef USE_MAGMA +AT_ERROR("gesv: MAGMA library not found in " + "compilation. Please rebuild with MAGMA."); +#else + auto A_data = A.data(); + auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + + magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); + magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); + magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); + + magma_int_t* info_array; + magma_int_t* ipiv_data; + magma_int_t** ipiv_array; + scalar_t** A_array; + scalar_t** b_array; + + ALLOCATE_ARRAY(info_array, magma_int_t, batch_size, b); + ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n, b); + ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size, b); + ALLOCATE_ARRAY(A_array, scalar_t*, batch_size, b); + ALLOCATE_ARRAY(b_array, scalar_t*, batch_size, b); + + // Set up the created arrays + for (int64_t i = 0; i < batch_size; i++) { + A_array[i] = &A_data[i * A_mat_stride]; + b_array[i] = &b_data[i * b_mat_stride]; + ipiv_array[i] = &ipiv_data[i * n]; + } + + magmaGesvBatched( + n, nrhs, A_array, n, ipiv_array, b_array, n, + info_array, batch_size, createMagmaQueue(b)); + + for (int64_t i = 0; i < batch_size; i++) { + infos[i] = info_array[i]; + } +#endif +} + +std::tuple _gesv_helper_cuda(const Tensor& self, const Tensor& A) { + std::vector infos(batchCount(A), 0); + auto A_working_copy = cloneBatchedColumnMajor(A); + auto b_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{ + applyGesv(b_working_copy, A_working_copy, infos); + }); + checkErrors(infos); + return std::tuple(b_working_copy, A_working_copy); +} + +}} // namespace at::native + +#undef ALLOCATE_ARRAY diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu new file mode 100644 index 0000000..63f1f26 --- /dev/null +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -0,0 +1,214 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Error.h" + +#include +#include + +namespace at { +namespace native { + +__host__ __device__ __forceinline__ float fmin(float a, float b) { + return a > b ? b : a; +} + +__host__ __device__ __forceinline__ float fmax(float a, float b) { + return a > b ? 
a : b; +} + +template +__global__ void RoiPooling2d_forward_kernel( + const int outputElements, + const T *input, + const T *rois, + const T spatialScale, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int pooledHeight, + const int pooledWidth, + T *output, + int *argmaxes) +{ + for (int linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < outputElements; + linearIndex += blockDim.x * gridDim.x) + { + // Calculate position in output Tensor, i.e. a specific combination + // of proposal, channel, pool height and pool width + // TODO: write to improve performance by minimize computation + int pw = linearIndex % pooledWidth; + int ph = (linearIndex / pooledWidth) % pooledHeight; + int ch = (linearIndex / pooledWidth / pooledHeight) % inputChannels; + int proposal = linearIndex / pooledWidth / pooledHeight / inputChannels; + + // Get particular proposal data + const T *roisOffset = rois + (proposal * 5); + int n = roisOffset[0]; + int startWidth = llrintf(roisOffset[1] * spatialScale); + int startHeight = llrintf(roisOffset[2] * spatialScale); + int endWidth = llrintf(roisOffset[3] * spatialScale); + int endHeight = llrintf(roisOffset[4] * spatialScale); + + // TODO: fix malformed RoIs to be 1x1 + + int roiHeight = endHeight - startHeight; + int roiWidth = endWidth - startWidth; + + // Calculate size of tile based on the size of this particular RoI and the + // output size + T tileHeight = static_cast(roiHeight) / static_cast(pooledHeight); + T tileWidth = static_cast(roiWidth) / static_cast(pooledWidth); + + // Calculate offset into the pooled region + int tileHStart = static_cast(floorf(static_cast(ph) * tileHeight)); + int tileWStart = static_cast(floorf(static_cast(pw) * tileWidth)); + int tileHEnd = static_cast(ceilf(static_cast(ph + 1) * tileHeight)); + int tileWEnd = static_cast(ceilf(static_cast(pw + 1) * tileWidth)); + + // Calculate offset into the image itself, based on RoI + pooled offsets, + // and ensure it falls within image boundaries + tileHStart = fmin(fmax(tileHStart + startHeight, 0), inputHeight); + tileWStart = fmin(fmax(tileWStart + startWidth, 0), inputWidth); + tileHEnd = fmin(fmax(tileHEnd + startHeight, 0), inputHeight); + tileWEnd = fmin(fmax(tileWEnd + startWidth, 0), inputWidth); + + // If our pooling region is empty, we set the output to 0, otherwise to + // the min float so we can calculate the max properly + bool isEmpty = (tileHStart >= tileHEnd) || (tileWStart >= tileWEnd); + T max = isEmpty ? 0 : FLT_MIN; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxIdx = -1; + + const T *inputOffset = input + ((n * inputChannels + ch) * inputHeight * inputWidth); + for (int th = tileHStart; th < tileHEnd; ++th) { + for (int tw = tileWStart; tw < tileWEnd; ++tw) { + int index = (th * inputWidth) + tw; + if (inputOffset[index] > max) { + max = inputOffset[index]; + maxIdx = index; + } + } + } + output[linearIndex] = max; + + // TODO optional argmax + argmaxes[linearIndex] = maxIdx; + } +} + +std::tuple RoiPooling2d_forward_cuda( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale) +{ + + // Input is the output of the last convolutional layer in the Backbone network, so + // it should be in the format of NCHW + AT_CHECK(input.ndimension() == 4, "Input to RoI Pooling should be a NCHW Tensor"); + + // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first + // dim is the # of proposals, and the second dim is the proposal itself in the form + // [batch_index startW startH endW endH] + AT_CHECK(rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); + AT_CHECK(rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); + + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) + auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + // TODO: need some mechanism for determining train vs. test + + // During training, we need to store the argmaxes for the pooling operation, so + // the argmaxes Tensor should be the same size as the output Tensor + auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + AT_CHECK(input.is_contiguous(), "input must be contiguous"); + AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); + + dim3 block(512); + dim3 grid((output.numel() + 512 - 1) / 512); + RoiPooling2d_forward_kernel<<>>( + output.numel(), input.data(), rois.data(), static_cast(spatialScale), inputChannels, + inputHeight, inputWidth, pooledHeight, pooledWidth, output.data(), argmaxes.data()); + AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_forward_kernel failed with error code ", cudaGetLastError()); + + return std::make_tuple(output, argmaxes); +} + +template +__global__ void RoiPooling2d_backward_kernel( + const int outputElements, + const T *gradOutput, + const int *argmaxes, + const int proposals, + const T spatialScale, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int pooledHeight, + const int pooledWidth, + T *gradInput, + const T *rois) +{ + for (int linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < outputElements; + linearIndex += blockDim.x * gridDim.x) + { + int pw = linearIndex % pooledWidth; + int ph = (linearIndex / pooledWidth) / pooledHeight; + int ch = (linearIndex / pooledWidth / pooledHeight) % inputChannels; + int proposal = linearIndex / pooledWidth / pooledHeight / inputChannels; + + const T *roisOffset = rois + (proposal * 5); + int n = roisOffset[0]; + int gradInputOffset = (n * inputChannels + ch) * inputHeight * inputWidth; + int gradOutputOffset = (n * inputChannels + ch) * pooledHeight * pooledWidth; + const T* gradOutputShifted = gradOutput + gradOutputOffset; + T *gradInputShifted = gradInput + gradInputOffset; + const int *argmaxesShifted = argmaxes + gradOutputOffset; + + int argmax = argmaxesShifted[ph * pooledWidth + pw]; + if (argmax != -1) { + atomicAdd(gradInputShifted + argmax, gradOutputShifted[ph * pooledWidth + pw]); + } + } +} + +Tensor RoiPooling2d_backward_cuda( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale, + const Tensor& gradOutput, + const Tensor& argmaxes) +{ + // TODO: assertions? 
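+  //
+  // A sketch of what the TODO above might check, mirroring the forward pass
+  // (illustrative only, not enabled here):
+  //
+  //   AT_CHECK(input.ndimension() == 4, "input must be a NCHW Tensor");
+  //   AT_CHECK(rois.ndimension() == 2 && rois.size(1) == 5,
+  //            "rois must be of the form [batch_index startW startH endW endH]");
+  //   AT_CHECK(gradOutput.is_contiguous() && argmaxes.is_contiguous(),
+  //            "gradOutput and argmaxes must be contiguous");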
+ + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + auto gradInput = input.type().tensor(input.sizes()); + + dim3 block(512); + dim3 grid((gradInput.numel() + 512 - 1) / 512); + RoiPooling2d_backward_kernel<<>>( + gradOutput.numel(), gradOutput.data(), argmaxes.data(), proposals, + static_cast(spatialScale), inputChannels, inputHeight, inputWidth, + pooledHeight, pooledWidth, gradInput.data(), rois.data()); + AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_backward_kernel failed with error code ", cudaGetLastError()); + + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu new file mode 100644 index 0000000..0ee5d18 --- /dev/null +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -0,0 +1,596 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include +#include +#include +#include + +#include "ATen/AccumulateType.h" + + +namespace at { +namespace native { + +namespace { + +template +struct LogSoftMaxForwardEpilogue { + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : logsum(max_input + std::log(sum)) {} + + __device__ __forceinline__ T operator()(T input) const { + return static_cast(input - logsum); +} + + const AccumT logsum; +}; + +template +struct LogSoftMaxBackwardEpilogue { + __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + __device__ __forceinline__ T operator()(T gradOutput, T output) const { + return static_cast(gradOutput - std::exp(static_cast(output)) * sum); + } + + const AccumT sum; +}; + +template +struct SoftMaxForwardEpilogue { + __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : max_input(max_input) + , sum(sum) {} + + __device__ __forceinline__ T operator()(T input) const { + return static_cast(std::exp(input - max_input) / sum); + } + + const AccumT max_input; + const AccumT sum; +}; + +template +struct SoftMaxBackwardEpilogue { + __device__ __forceinline__ SoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + // XXX: gradOutput that we get here is really gradOutput * output + // Look for cmul in SoftMax_updateGradInput + __device__ __forceinline__ T operator()(T gradOutput, T output) const { + return static_cast(gradOutput - output * sum); + } + + const AccumT sum; +}; + + + + +//////////////////////////////////////////////////////////////////////////////// +// Spatial kernel (fast with large inner_size and small dim_size) +//////////////////////////////////////////////////////////////////////////////// +// Let's assume that our input has been flattened to have only three dimension: +// outer x dim x inner +// The spatial algorithm tries to paralellize along all of them. +// Within a 2d block threadIdx.y paralellizes over dim slices, and threads that +// share it will speed up reductions over dim (along axis x). +// The 2d grid is used to paralellize inner dimension over y axis and outer over x. 
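+//
+// Worked example with hypothetical sizes outer=128, dim=16, inner=4096:
+// SpatialSoftMax_getBlockSize picks block = (dim_threads=1, inner_threads=1024),
+// since inner_threads > 64 leaves the dim-reduction threads at 1, so no shared
+// memory is needed (smem_size = 0 when block.x == 1). The grid then tiles
+// ceil(4096 / 1024) = 4 blocks along y for the inner dimension (capped by the
+// occupancy limit) and spends the remaining active blocks on outer along x.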
+inline dim3 SpatialSoftMax_getGridSize( + dim3 block, uint32_t max_active_blocks, + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) { + // First, tile as many blocks as we can over the y axis + uint32_t inner_blocks = (inner_size + block.y - 1) / block.y; + if (inner_blocks > max_active_blocks) + inner_blocks = max_active_blocks; + // Fill the x axis with as many blocks as we can fit (a little more is ok too) + uint32_t outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; + if (outer_blocks > outer_size) + outer_blocks = outer_size; + return dim3(outer_blocks, inner_blocks); +} + +const int max_threads = 1024; + +inline dim3 SpatialSoftMax_getBlockSize( + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) { + uint32_t inner_threads = inner_size; + inner_threads = std::min(inner_threads, static_cast(max_threads)); + uint32_t dim_threads = 1; + if (inner_threads <= 64 && dim_size >= 64) { + while (inner_threads * dim_threads <= max_threads && dim_threads <= dim_size) + dim_threads *= 2; + dim_threads /= 2; + } + return dim3(dim_threads, inner_threads); +} + + +template +void SpatialSoftMax_getLaunchSizes( + Kernel k, + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size, + dim3& grid, dim3& block, uint32_t& smem_size) { + block = SpatialSoftMax_getBlockSize(outer_size, dim_size, inner_size); + uint32_t block_threads = block.x * block.y; + smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); + int max_active_blocks; +#ifdef __HIP_PLATFORM_HCC__ + max_active_blocks = 16; +#else + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + k, block_threads, smem_size); +#endif + max_active_blocks *= at::globalContext().getCurrentDeviceProperties()->multiProcessorCount; + grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); +} + +inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); + while (block_size < max_block_size) block_size *= 2; + // Launch at least a single warp - the kernel assumes that. + block_size = std::max(block_size, static_cast(32)); + return dim3(block_size); +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +// Note that it's not a complete block-wide reduction. +// Only threads that share threadIdx.y reduce values. 
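+//
+// Illustrative trace for blockDim.x = 4 and row values {3, 1, 4, 2} with the
+// Add reduction: offset 2 folds to {7, 3, _, _}, offset 1 folds to {10, _, _, _},
+// and every thread in that threadIdx.y row then reads 10 from shared[0].
+// Rows with a different threadIdx.y reduce their own values independently.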
+template class ReduceOp> +__forceinline__ __device__ +T spatialBlockReduceX(T *shared, T val) { + ReduceOp r; + shared += threadIdx.y * blockDim.x; + + __syncthreads(); + + shared[threadIdx.x] = val; + + // NOTE: loop starts with __syncthreads() + int offset = blockDim.x / 2; + while (offset > 0) { + __syncthreads(); + if (threadIdx.x < offset) + shared[threadIdx.x] = r(shared[threadIdx.x], shared[threadIdx.x + offset]); + offset /= 2; + } + + __syncthreads(); + + return shared[0]; +} + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxForward( + scalar_t *output, scalar_t *input, + uint32_t outer_size, uint32_t dim_size, uint32_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const uint32_t outer_stride = inner_size * dim_size; + const uint32_t dim_stride = inner_size; + + for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const uint32_t outer_offset = outer_index * outer_stride; + for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) { + const uint32_t data_offset = outer_offset + inner_index; + //////////////////////////////////////////////////////////// + // These two blocks are really eqivalent, but specializing on + // blockDim.x == 1 makes the kernel faster when it's unused. + // I didn't want to thread an extra template parameter, and nvcc + // seems to be smart enough to hoist the if outside of the loops. + //////////////////////////////////////////////////////////// + + if (blockDim.x > 1) { + accscalar_t max_input = THCNumerics::min(); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + max_input = spatialBlockReduceX(sdata,max_input); + + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += std::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(max_input, sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } else { + accscalar_t max_input = THCNumerics::min(); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += std::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + Epilogue epilogue(max_input, sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } + } + } +} + + + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxBackward( + scalar_t *gradInput, scalar_t *output, scalar_t *gradOutput, + uint32_t outer_size, uint32_t dim_size, uint32_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const uint32_t outer_stride = inner_size * dim_size; + const uint32_t dim_stride = inner_size; + + for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const uint32_t outer_offset = outer_index * outer_stride; + for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < 
inner_size; inner_index += blockDim.y * gridDim.y) { + const uint32_t data_offset = outer_offset + inner_index; + // See the comment in forward kernel + if (blockDim.x > 1) { + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += gradOutput[data_offset + d * dim_stride]; + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } else { + accscalar_t sum = 0; + for (uint32_t d = 0; d < dim_size; d++) + sum += gradOutput[data_offset + d * dim_stride]; + + Epilogue epilogue(sum); + for (uint32_t d = 0; d < dim_size; d++) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } + } + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Regular kernel (fast when dim_size is large; requires inner_size == 1) +//////////////////////////////////////////////////////////////////////////////// + + +template +struct MaxFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { + return ::max(max, (AccumT)v); + } +}; + +template +struct AddFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + v; + } +}; + +template +struct SumExpFloat +{ + __device__ __forceinline__ SumExpFloat(AccumT v) + : max_k(v) {} + + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + std::exp(v - max_k); + } + + const AccumT max_k; +}; + +template class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction& r, + AccumT defaultVal) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val; + + __syncthreads(); + + AccumT warpVal = defaultVal; + + // First warp will perform per-warp reductions for the remaining warps + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal = r(warpVal, smem[lane * 32 + i]); + } + smem[lane] = warpVal; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal = defaultVal; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal = r(blockVal, smem[i]); + } + smem[0] = blockVal; + } + + // Sync and broadcast + __syncthreads(); + return smem[0]; +} + +template class Reduction, int ILP, typename T, typename AccumT> +__device__ __forceinline__ AccumT +ilpReduce(T* data, + int size, + const Reduction& r, + AccumT defaultVal) +{ + AccumT threadVal = defaultVal; + int offset = threadIdx.x; + + int last = size % (ILP * blockDim.x); + + // Body (unroll by ILP times) + for (; offset < size - last; offset += blockDim.x * ILP) { + T tmp[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + tmp[j] = data[offset + j * blockDim.x]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + threadVal = r(threadVal, tmp[j]); + } + + // Epilogue + for (; offset < size; offset += blockDim.x) + threadVal = r(threadVal, data[offset]); + + return threadVal; +} + +template class Epilogue> +__global__ void +cunn_SoftMaxForward(scalar_t *output, scalar_t *input, int classes) +{ + extern __shared__ unsigned char smem[]; 
+ auto sdata = reinterpret_cast(smem); + // forward pointers to batch[blockIdx.x] + // each block handles a sample in the mini-batch + input += blockIdx.x * classes; + output += blockIdx.x * classes; + + // find the max + accscalar_t threadMax = ilpReduce( + input, classes, MaxFloat(), -THCNumerics::max()); + accscalar_t max_k = blockReduce( + sdata, threadMax, Max(), -THCNumerics::max()); + + // reduce all values + accscalar_t threadExp = ilpReduce( + input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduce( + sdata, threadExp, Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + for (; offset < classes - last; offset += blockDim.x * ILP) { + scalar_t tmp[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + tmp[j] = input[offset + j * blockDim.x]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + output[offset + j * blockDim.x] = epilogue(tmp[j]); + } + + for (; offset < classes; offset += blockDim.x) + output[offset] = epilogue(input[offset]); +} + +template class Epilogue> +__global__ void +cunn_SoftMaxBackward(scalar_t *gradInput, scalar_t *output, scalar_t *gradOutput, int classes) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + gradInput += blockIdx.x * classes; + output += blockIdx.x * classes; + gradOutput += blockIdx.x * classes; + + accscalar_t threadSum = ilpReduce( + gradOutput, classes, AddFloat(), accscalar_t(0)); + accscalar_t sum_k = blockReduce( + sdata, threadSum, Add(), accscalar_t(0)); + + Epilogue epilogue(sum_k); + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + for (; offset < classes - last; offset += blockDim.x * ILP) { + scalar_t tmpGradOutput[ILP]; + scalar_t tmpOutput[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + tmpGradOutput[j] = gradOutput[offset + j * blockDim.x]; + tmpOutput[j] = output[offset + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < ILP; ++j) + gradInput[offset + j * blockDim.x] = epilogue(tmpGradOutput[j], tmpOutput[j]); + } + + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = epilogue(gradOutput[offset], output[offset]); +} + + + + + + +template class Epilogue> +Tensor host_softmax(const Tensor & input_, const int64_t dim_){ + auto input = input_.contiguous(); + Tensor output = at::empty_like(input); + if (input.dim() == 0) input = input.view(1); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + AT_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + int64_t inner_size = 1; + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + // This kernel spawns a block per each element in the batch. + // XXX: it assumes that inner_size == 1 + if (inner_size == 1) { + const int ILP = 2; + dim3 grid(outer_size); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] { + using accscalar_t = acc_type; + cunn_SoftMaxForward + <<>>( + output.data(), input.data(), dim_size + ); + }); + // This kernel runs in a 2D grid, where each application along y dimension has a fixed + // outer_size, and runs in parallel over inner_size. Dimension x is parallel over outer_size. 
+ // Reductions over dim are done in a single-threaded manner. + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] { + using accscalar_t = acc_type; + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data(), input.data(), outer_size, dim_size, inner_size + ); + }); + } + THCudaCheck(cudaGetLastError()); + return output; +} + +template class Epilogue> +Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t dim_){ + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + Tensor gI = at::empty_like(grad); + if (grad.dim() == 0) grad = grad.view(1); + AT_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + auto output = output_.contiguous(); + if (output.dim() == 0) output = output.view(1); + int64_t outer_size = 1; + int64_t dim_size = output.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= output.size(i); + for (int64_t i = dim + 1; i < output.dim(); ++i) + inner_size *= output.size(i); +// See descriptions of kernels above. + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + if (inner_size == 1) { + const int ILP = 2; + dim3 grid(outer_size); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + cunn_SoftMaxBackward + <<>>( + gI.data(), output.data(), grad.data(), dim_size + ); + }); + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data(), output.data(), grad.data(), + outer_size, dim_size, inner_size + ); + }); + } + THCudaCheck(cudaGetLastError()); + return gI; +} +} + +Tensor log_softmax_cuda(const Tensor &input, const int64_t dim){ + return host_softmax(input, dim); +} + +Tensor log_softmax_backward_cuda(const Tensor &grad, const Tensor &output, int64_t dim, const Tensor &input){ + return host_softmax_backward(grad, output, dim); +} + +Tensor softmax_cuda(const Tensor &input, const int64_t dim){ + return host_softmax(input, dim); +} + +Tensor softmax_backward_cuda(const Tensor &grad, const Tensor &output, int64_t dim, const Tensor &input){ + + Tensor tmp = grad * output; + return host_softmax_backward(tmp, output, dim); +} + +} +} diff --git a/aten/src/ATen/native/cuda/SparseMM.cu b/aten/src/ATen/native/cuda/SparseMM.cu new file mode 100644 index 0000000..29dfd8d --- /dev/null +++ b/aten/src/ATen/native/cuda/SparseMM.cu @@ -0,0 +1,15 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" + +namespace at { namespace native { +// sparse, sparse, sparse, dense, real, real -> sparse +Tensor& _sspaddmm_out_only_sparse_cuda(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("tensor.sspaddmm(...) 
can only be called on sparse tensors"); +} +Tensor& _sspaddmm_out_cuda(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("NYI: CUDA sspaddmm is not implemented"); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu new file mode 100644 index 0000000..7266ebd --- /dev/null +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -0,0 +1,310 @@ +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/SpectralOpsUtils.h" +#include "ATen/native/cuda/CuFFTUtils.h" +#include "ATen/native/cuda/CuFFTPlanCache.h" +#include +#include + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +using namespace at::native::detail; + +// In real-to-complex transform, cuFFT only fills half of the values due to +// conjugate symmetry. See native/SpectralUtils.h for more details. +// The following structs are used to fill in the other half with symmetry in +// case of real-to-complex transform with onesided=False flag. +// See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h. + +// counting_iterator => index to fill +struct cnt_to_dst_idx_functor : public thrust::unary_function +{ + const int64_t last_dim_size; + const int64_t last_dim_start_slice; + const int64_t last_dim_to_fill_size; + + cnt_to_dst_idx_functor(int64_t last_dim_size, int64_t last_dim_start_slice) : + last_dim_size(last_dim_size), last_dim_start_slice(last_dim_start_slice), + last_dim_to_fill_size(last_dim_size - last_dim_start_slice) {} + + __host__ __device__ __forceinline__ + int64_t operator()(const int64_t& i) const + { + int64_t imag = i % 2; + int64_t idx = i / 2; + int64_t num_dim = idx / last_dim_to_fill_size; + int64_t slice_idx = idx % last_dim_to_fill_size; + return (num_dim * last_dim_size + last_dim_start_slice + slice_idx) * 2 + imag; + } +}; + +// index to fill => index to read from +template +struct dst_idx_to_src_functor : public thrust::unary_function +{ + // output can have at most dim 5 (batch + 3 signal dim + real/imag) + int64_t sizes[max_rank + 2], strides[max_rank + 2]; + const int64_t signal_ndim; + scalar_t *data; // device ptr + + dst_idx_to_src_functor(const Tensor& batched_complex_signal) + : signal_ndim(batched_complex_signal.dim() - 1), + data(batched_complex_signal.data()) { + for (int64_t i = 0; i < signal_ndim; i++) { + sizes[i] = batched_complex_signal.size(i); + strides[i] = batched_complex_signal.stride(i); + } + } + + __device__ __forceinline__ + scalar_t operator()(const int64_t& write_idx_with_imag) const + { + int64_t imag = write_idx_with_imag % 2; + // all but first (batch) and last (real/imag) dims need to be reflected + int64_t read_idx = 0; + int64_t remainder = write_idx_with_imag - imag; + int64_t dim_idx, dim_stride; + for (int64_t i = 0; i < signal_ndim; i++) { + dim_stride = strides[i]; + dim_idx = remainder / dim_stride; + if (i == 0) { + read_idx += dim_idx * dim_stride; + } else if (dim_idx != 0) { + read_idx += (sizes[i] - dim_idx) * dim_stride; + } + remainder = remainder % dim_stride; + } + if (imag) { + return -data[read_idx + 1]; + } else { + return data[read_idx]; + } + } +}; + +// input should be a contiguous batched tensor of same size as full (twosided) +// signals, but only contains half (onesided) of the values. +// This function modifies inplace. 
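+//
+// Concrete example: for a real 1-D signal of length 8, the R2C transform
+// writes only the first 8 / 2 + 1 = 5 complex bins; the remaining bins
+// k = 5, 6, 7 are filled in here as conj(X[8 - k]) -- a mirrored read index
+// with a negated imaginary part, which is what dst_idx_to_src_functor above
+// computes.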
+__forceinline__ +static void _fft_fill_with_conjugate_symmetry_(Tensor& input, + int64_t size_last_dim, int64_t last_dim_start_slice) { + if (last_dim_start_slice >= size_last_dim) { + return; + } + + // copy + int64_t n = input.numel() / size_last_dim * (size_last_dim - last_dim_start_slice); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "_fft_fill_with_conjugate_symmetry_", [&] { + typedef thrust::device_ptr device_ptr; + typedef thrust::counting_iterator counter; + typedef thrust::transform_iterator dst_idx_iterator; + typedef thrust::permutation_iterator dst_iterator; + typedef thrust::transform_iterator, dst_idx_iterator> src_iterator; + + dst_idx_iterator dst_idxs(counter(0), cnt_to_dst_idx_functor(size_last_dim, last_dim_start_slice)); + + auto data = device_ptr(input.data()); + dst_iterator dsts(data, dst_idxs); + src_iterator srcs(dst_idxs, dst_idx_to_src_functor(input)); + thrust::copy_n(policy, srcs, n, dsts); + }); +} + +// NOTE [ cuFFT Embedded Strides ] +// +// cuFFT supports a subset of arbitrary strides via their "advanced data layout" +// option (http://docs.nvidia.com/cuda/cufft/index.html#advanced-data-layout). +// Specifically, these are tensors that can be viewed as subtensors resulted +// from slicing a larger contiguous tensors. For such input tensors, let the +// sizes of the enclosing tensor be `inembed`, and we can have in 3d case: +// +// input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z)] +// +// Above is the simplified formula ignoring the batch dimension. In fact, the +// last dimension of the enclosing tensor doesn't have to be contiguous, i.e., +// it can be greater than 1. Then one can set the base stride for the enclosing +// tensor with `istride`. Then we have +// +// input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z) * istride] +// +// For example, consider +// +// enclosing = torch.zeros(6, 8, 10) # contiguous +// input = enclosing[:4, 2:6, 6:] +// input.size() # [ 4, 4, 4] +// input.stride() # [80, 10, 1] +// # inembed = [6, 8, 10] +// input[2, 1, 3] = input[((2 * 8) + 1) * 10 + 3] # using above formula +// = input[173] +// = input[2 * 80 + 1 * 10 + 1 * 3] # using strides directly +// +// Generally, the embedded strides can be computed as +// +// embed[i] = stride[i - 1] / stride[i]. +// +// Note that the value of embed[0] isn't used to compute indices and doesn't +// matter. +// +// Contrary to advanced data layout, simple layout means that *embeds have +// unit-strides. In particular, unit-stride refers to that the input and output +// tensors being contiguous, and that the strides at the innermost signal +// dimension being unit (1) w.r.t. the corresponding data type. 
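+
+// A minimal sketch of the embed computation described above (illustrative
+// only; the actual plan setup happens inside CuFFTConfig). embed[0] is left
+// at zero because cuFFT never reads it.
+inline std::vector<int64_t> cufft_embedded_sizes_sketch(IntList strides) {
+  std::vector<int64_t> embed(strides.size(), 0);
+  for (size_t i = 1; i < strides.size(); i++) {
+    embed[i] = strides[i - 1] / strides[i];
+  }
+  return embed;
+}
+// For the example above (strides {80, 10, 1}) this gives {0, 8, 10}, agreeing
+// with inembed = [6, 8, 10] in every position cuFFT actually uses.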
+ +static inline Tensor _run_cufft( + const CuFFTConfig &config, Tensor& input, int64_t signal_ndim, + bool complex_input, bool complex_output, bool inverse, + IntList checked_signal_sizes, bool normalized, bool onesided, + IntList output_sizes, bool input_was_cloned +) { + if (config.should_clone_input() && !input_was_cloned) { + input = input.clone(); + } + + auto& plan = config.plan(); + auto& ctx = at::globalContext(); + + // set output + auto output = input.type().tensor(output_sizes); + + // set to current stream + CUFFT_CHECK(cufftSetStream(plan, ctx.getCurrentCUDAStream())); + + auto ws = ctx.getType(at::Backend::CUDA, at::ScalarType::Byte).tensor({ config.workspace_size() }); + CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); + + // run + CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), + inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); + + // rescale if needed by normalized flag or inverse transform + auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1]; + if (normalized || inverse) { + auto signal_numel = at::prod_intlist(checked_signal_sizes); + double scale_denom; + if (normalized) { + scale_denom = std::sqrt(static_cast(signal_numel)); + } else { + scale_denom = static_cast(signal_numel); + } + if (!complex_input && complex_output && !onesided) { + auto end_data_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); + output.narrow(signal_ndim, 0, end_data_slice).div_(scale_denom); + } else { + output.div_(scale_denom); + } + } + + // if needed, fill out the other half using conjugate symmetry + if (!complex_input && complex_output && !onesided) { + auto start_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); + _fft_fill_with_conjugate_symmetry_(output, size_last_signal_dim, start_slice); + } + return output; +} + +// The cuFFT plan cache, defined in CuFFTUtils.h +struct CuFFTParamsLRUCache plan_cache; +std::mutex plan_cache_mutex; + +namespace detail { + +int64_t cufft_get_plan_cache_max_size_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.max_size(); +} + +void cufft_set_plan_cache_max_size_impl(int64_t max_size) { + std::lock_guard guard(plan_cache_mutex); + plan_cache.resize(max_size); +} + +int64_t cufft_get_plan_cache_size_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.size(); +} + +void cufft_clear_plan_cache_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.clear(); +} + +} // namespace at::native::detail + +// cuFFT +// Currently not utilizing multi GPUs so this can be potentially sped up. +Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim, + bool complex_input, bool complex_output, bool inverse, + IntList checked_signal_sizes, bool normalized, bool onesided, + IntList output_sizes) { + Tensor input = self; + bool input_was_cloned = false; + + // Slice when twosided complex-to-real. This is not always needed because we + // calculate the inembed. But it will benefit us in certain cases where we + // clone the input tensor. + // + // See NOTE [ cuFFT Embedded Strides ]. + // See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h. + if (complex_input && !complex_output && !onesided) { + auto onesided_size = infer_ft_real_to_complex_onesided_size(checked_signal_sizes[signal_ndim - 1]); + input = input.narrow(signal_ndim, 0, onesided_size); + } + + // cuFFT requires input and output data pointers to complex type aligned. 
+ // Our allocated output tensor is always 256 bytes aligned so it is fine, but + // we need to check input tensor to make sure that it is not unaligned, e.g., + // from a slicing. + auto complex_size_bytes = 2 * input.type().elementSizeInBytes(); + if (reinterpret_cast(input.data_ptr()) % complex_size_bytes != 0) { + input = input.clone(); + input_was_cloned = true; + } + + // Now that we have done error check and data_ptr checks, we delegate all + // futher cuFFT parameter computation and plan creation to the helper class + // CuFFTConfig in CuFFTUtils.h. + + // If plan caching is enabled, we check the cache. Note that this accesses + // plan_cache.max_size() and thus makes this function less functional. + // However, integrating additional arguments into the "public" level c++ APIs, + // e.g., irfft, is difficult as we have a long call sequence looking like + // irfft --> _fft --> _fft_with_size --dispatching-to-> _fft_cufft + + // This read is not locked for perf reason. Shouldn't matter too much because + // we check again after acquiring the lock. + if (plan_cache.max_size() > 0) { + CuFFTParams params; + setCuFFTParams(¶ms, input, signal_ndim, complex_input, + complex_output, checked_signal_sizes, onesided); + std::lock_guard guard(plan_cache_mutex); + if (plan_cache.max_size() > 0) { // check again after acquiring the lock + const CuFFTConfig &config = plan_cache.try_emplace_value(std::move(params), + input, signal_ndim, complex_input, + complex_output, checked_signal_sizes, + onesided, output_sizes); + return _run_cufft(config, input, signal_ndim, complex_input, + complex_output, inverse, checked_signal_sizes, normalized, + onesided, output_sizes, input_was_cloned); + } + } + CuFFTConfig config(input, signal_ndim, complex_input, complex_output, + checked_signal_sizes, onesided, output_sizes); + return _run_cufft(config, input, signal_ndim, complex_input, + complex_output, inverse, checked_signal_sizes, normalized, + onesided, output_sizes, input_was_cloned); +} + +}} // at::native diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu new file mode 100644 index 0000000..46c812c --- /dev/null +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -0,0 +1,291 @@ +#include "ATen/ATen.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +namespace at { +namespace cuda { +#define THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM 100 +#define THRESH_NUMBER_BINS_FOR_GLOBAL_MEM 1000 +#define FOR_KERNEL_LOOP(i, lim) \ + for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < lim; \ + i += gridDim.x * blockDim.x) + +/* + Memory types used for the 3 histogram implementations. + See `CUDA_tensor_histogram` below. + */ +enum class CUDAHistogramMemoryType { SHARED, MULTI_BLOCK, GLOBAL }; + +/* + Kernel for computing the histogram of the input. 
+ */ +template < + typename output_t, + typename input_t, + typename IndexType, + int ADims, + int PDims, + int BDims, + CUDAHistogramMemoryType MemoryType = CUDAHistogramMemoryType::MULTI_BLOCK, + typename Op> +__global__ void kernelHistogram1D( + detail::TensorInfo a, /* output */ + detail::TensorInfo p, /* partial output */ + detail::TensorInfo b, /* input */ + int binsize, + IndexType totalElements, + Op getOp) { + extern __shared__ unsigned char my_smem[]; + output_t* smem = nullptr; + + if (MemoryType == CUDAHistogramMemoryType::SHARED) { + ////////////////////////// Shared memory ////////////////////////// + // atomically add to block specific shared memory + // then atomically add to the global output tensor + smem = reinterpret_cast(my_smem); + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + smem[i] = 0; + } + __syncthreads(); + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + // Use value at `b` as an offset of `smem` + const IndexType pOffset = b.data[bOffset] / binsize; + atomicAdd(&smem[pOffset], getOp(linearIndex)); + } + __syncthreads(); + // NOTE: atomically update output bin count. + // Atomic update is imp since __syncthread() will only synchronize threads + // in a given block, not across blocks. + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + const IndexType aOffset = + detail::IndexToOffset::get(i, a); + atomicAdd(&a.data[aOffset], smem[i]); + } + + } else if (MemoryType == CUDAHistogramMemoryType::MULTI_BLOCK) { + ////////////////////////// Multi Block memory ////////////////////////// + // atomically add to block specific global tensor + // then atomically add to the global output tensor + // compute histogram for the block + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + const auto bVal = b.data[bOffset]; + // Use value at `b` as an offset of `p` + const IndexType pIdx = p.strides[0] * blockIdx.x + bVal / binsize; + const IndexType pOffset = + detail::IndexToOffset::get(pIdx, p); + atomicAdd(&p.data[pOffset], getOp(linearIndex)); + } + __syncthreads(); + // NOTE: atomically update output bin count. + // Atomic update is imp since __syncthread() will only synchronize threads + // in a given block, not across blocks. 
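+    // Concretely, the partial tensor `p` has one row of nbins counters per
+    // block (it is allocated as [gridDim.x, nbins] in CUDA_tensor_histogram
+    // below), so block blockIdx.x folds its own row into the final output `a`.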
+ const IndexType pIdx = p.strides[0] * blockIdx.x; + const IndexType pOffset = + detail::IndexToOffset::get(pIdx, p); + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + const IndexType aOffset = + detail::IndexToOffset::get(i, a); + atomicAdd(&a.data[aOffset], p.data[pOffset + i]); + } + + } else { + ////////////////////////// Global memory ////////////////////////// + // atomically add to the output tensor + // compute histogram for the block + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + const auto bVal = b.data[bOffset]; + // Use value at `b` as an offset of `a` + const IndexType aIdx = bVal / binsize; + const IndexType aOffset = + detail::IndexToOffset::get(aIdx, a); + atomicAdd(&a.data[aOffset], getOp(linearIndex)); + } + } +} + +#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP) \ + kernelHistogram1D \ + <<>>( \ + aInfo, pInfo, bInfo, binsize, totalElements, WEIGHTS_OP); \ + AT_ASSERTM(cudaGetLastError() == cudaSuccess, "kernelHistogram1D failed"); + +#define HANDLE_SWITCH_CASE(mType, getOp) \ + switch (mType) { \ + case CUDAHistogramMemoryType::SHARED: \ + HANDLE_CASE(CUDAHistogramMemoryType::SHARED, getOp); \ + break; \ + case CUDAHistogramMemoryType::MULTI_BLOCK: \ + HANDLE_CASE(CUDAHistogramMemoryType::MULTI_BLOCK, getOp); \ + break; \ + default: \ + HANDLE_CASE(CUDAHistogramMemoryType::GLOBAL, getOp); \ + } + +inline int64_t getFreeGlobalMemory() { + // no need to use `cudaSetDevice` + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + AT_ASSERTM( + cudaGetLastError() == cudaSuccess, + "CUDA_tensor_histogram failed to get free global memory"); + return static_cast(free_mem); +} + +/* + Calculate the frequency of the input values. + + `a` contains the final output or the histogram. + Input `b` is assumed to be 1-D non-negative int array. + `c` optionally contains the weight vector. + See `help torch.bincount` for details on the math. + + 3 implementations based of input size and memory usage: + case: #bins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM and enough shared mem + SHARED: Each block atomically adds to it's own **shared** hist copy, + then atomically updates the global tensor. + case: #bins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM and enough global mem + MULTI_BLOCK: Each block atomically adds to it's own **global** hist + copy, then atomically updates the global tensor. + case: THRESH_NUMBER_BINS_FOR_GLOBAL_MEM <= #bins + GLOBAL: all threads atomically update to a single **global** hist copy. 
+ */ +template +bool CUDA_tensor_histogram( + at::Tensor a, /* output */ + at::Tensor b, /* input */ + at::Tensor c, /* weights(optional) */ + int64_t nbins, + int binsize, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_histogram", {a, b}, Backend::CUDA); + if (HasWeights) { + checkBackend("CUDA_tensor_histogram", {c}, Backend::CUDA); + } + auto totalElements = b.size(0); + + const dim3 block = getApplyBlock(); + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1 || !getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL; + auto maxSharedMem = + at::globalContext().getCurrentDeviceProperties()->sharedMemPerBlock; + auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes + auto maxGlobalMem = getFreeGlobalMemory(); + auto multiBlockMem = nbins * grid.x * sizeof(output_t) + 8; // 8 guard bytes + // determine memory type to use in the kernel + if (nbins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM && + sharedMem < maxSharedMem) { + memType = CUDAHistogramMemoryType::SHARED; + } else if ( + nbins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM && + multiBlockMem < (maxGlobalMem / 2)) { + // check against half of free mem to be extra safe + // due to cached allocator, we may anyway have slightly more free mem + memType = CUDAHistogramMemoryType::MULTI_BLOCK; + } + + // alloc memory for MULTI_BLOCK + using IndexType = int64_t; + auto aInfo = detail::getTensorInfo(a); + auto bInfo = detail::getTensorInfo(b); + detail::TensorInfo pInfo(nullptr, 0, {}, {}); + Tensor partial_output; + if (memType == CUDAHistogramMemoryType::MULTI_BLOCK) { + partial_output = native::zeros({grid.x, nbins}, a.options()); + pInfo = detail::getTensorInfo(partial_output); + } + + if (HasWeights) { + auto cInfo = detail::getTensorInfo(c); + const auto getWeightsOp = [cInfo] __device__(IndexType cIndex) { + const IndexType cOffset = + detail::IndexToOffset::get(cIndex, cInfo); + return cInfo.data[cOffset]; + }; + HANDLE_SWITCH_CASE(memType, getWeightsOp) + } else { + static const auto getDummyOp = [] __device__(IndexType) { return 1L; }; + HANDLE_SWITCH_CASE(memType, getDummyOp) + } + return true; +} + +#undef HANDLE_CASE +#undef HANDLE_SWITCH_CASE +#undef FOR_KERNEL_LOOP +#undef THRESH_NUMBER_BINS_FOR_GLOBAL_MEM +#undef THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM +} // namespace cuda + +namespace { +///////////////// bincount ///////////////// +template +Tensor _bincount_cuda_template( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + if (minlength < 0) { + AT_ERROR("minlength should be >= 0"); + } + if (self.dim() != 1 || self.numel() == 0 || + (!std::is_same::value && + *self.min().toBackend(kCPU).data() < 0)) { + AT_ERROR("bincount only supports 1-d non-negative integral inputs."); + } + + bool has_weights = weights.defined(); + if (has_weights && weights.size(0) != self.size(0)) { + AT_ERROR("input and weights should have the same length"); + } + + auto maxScalarGpu = Scalar(self.max()); + auto nbins = maxScalarGpu.local().to() + 1L; + nbins = std::max(nbins, minlength); + // alloc output counter on GPU + Tensor output; + if (has_weights) { + output = native::zeros({nbins}, weights.options()); + auto ret = cuda::CUDA_tensor_histogram( + output, self, weights, nbins, 1); + } else { + output = native::zeros({nbins}, device(kCUDA).dtype(kLong)); + auto ret = 
cuda::CUDA_tensor_histogram( + output, self, weights, nbins, 1); + } + return output; +} +} // namespace + +namespace native { +Tensor _bincount_cuda( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] { + const auto scalar = weights.type().scalarType(); + if (scalar == ScalarType::Undefined || scalar == ScalarType::Float) + return _bincount_cuda_template(self, weights, minlength); + return _bincount_cuda_template( + self, weights.toType(CUDA(kDouble)), minlength); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu new file mode 100644 index 0000000..8e0cf4e --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -0,0 +1,41 @@ +#include "ATen/NativeFunctions.h" +#include "ATen/Dispatch.h" + +#include "ATen/cuda/CUDAApplyUtils.cuh" + +namespace { +template +void where_cuda( + at::Tensor& ret, + const at::Tensor& condition, + const at::Tensor& self, + const at::Tensor& other) { + // Yes this name is repetitive, but the CPU version is called + // CPU_tensor_apply4 and we don't have a CPU namespace or directory. + at::cuda::CUDA_tensor_apply4( + ret, + condition, + self, + other, + [] __device__( + scalar_t & ret_val, + const uint8_t& cond_val, + const scalar_t& self_val, + const scalar_t& other_val) { + ret_val = cond_val ? self_val : other_val; + }); +} +} // namespace + +namespace at { namespace native { +Tensor _s_where_cuda( + const Tensor& condition, + const Tensor& self, + const Tensor& other) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_ALL_TYPES_AND_HALF(ret.type(), "where", [&] { + where_cuda(ret, condition, self, other); + }); + return ret; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu new file mode 100644 index 0000000..420733d --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -0,0 +1,88 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor& eye_out_cuda(Tensor& result, int64_t n) { + return at::native::eye_out_cuda(result, n, /*m=*/-1); +} + +Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else + AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif + +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else + if(m < 0) { +#endif + m = n; + } + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + +Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { + AT_CHECK(n >= 0, "n must be non-negative, got", n); + AT_CHECK(result.type().scalarTensor(n).defined(), + "n is too large for result tensor type: '", result.type().toString(), "'"); + + result.resize_({n}); + + if (result.type().scalarType() == at::ScalarType::Half) { + auto result_float = CUDA(kFloat).tensor({n}); + result.copy_(randperm_out_cuda(result_float, n, generator)); + } else { + if (n < 30000) { // For small inputs, we offload it to CPU instead. 
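+      // (A CPU randperm followed by a single copy back to the GPU is usually
+      // cheaper than the thrust-based sort path below for small n; 30000 is a
+      // heuristic cutoff, not a hard requirement.)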
+ auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + randperm_out(result_cpu, n, generator); + result.copy_(result_cpu); + } else { + // Generate random values for the keys array + AT_DISPATCH_ALL_TYPES( + result.type(), "randperm_out_cuda", [&] { + auto keys = result.type().tensor(result.sizes()).random_(generator); + + auto result_data = thrust::device_ptr(result.data()); + auto keys_data = thrust::device_ptr(keys.data()); + + auto state = globalContext().getTHCState(); + THCThrustAllocator thrustAlloc(state); + auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)); + + thrust::sequence(policy, result_data, result_data + n); + + // Use the sorted order of keys to rearrange the result array + thrust::sort_by_key(policy, keys_data, keys_data + n, result_data); + } + ); + } + } + + return result; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu new file mode 100644 index 0000000..cc8e78c --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -0,0 +1,124 @@ +#include "ATen/native/TensorTransformations.h" + +#include "ATen/cuda/detail/IndexUtils.cuh" +#include "ATen/NativeFunctions.h" + +#include +#include + +namespace at { +namespace native { + +#define AT_APPLY_THREADS_PER_BLOCK 32 * 16 +#define AT_APPLY_BLOCKS_PER_SM 4 + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo in_tensor_info, + cuda::detail::TensorInfo out_tensor_info, + IndexType N, + int flip_dim, + IndexType total_dims) { + for (IndexType linear_index = blockIdx.x * blockDim.x + threadIdx.x; linear_index < N; linear_index += gridDim.x * blockDim.x) { + IndexType dst_offset = 0; + if (flip_dim == 0) { + // flip 1st dim + dst_offset = (in_tensor_info.sizes[0] - 1 - linear_index / in_tensor_info.strides[0]) * in_tensor_info.strides[0] + linear_index % in_tensor_info.strides[0]; + } + else { + // flip last dim + IndexType i = total_dims - 1; + dst_offset = linear_index / in_tensor_info.strides[0] * in_tensor_info.strides[0] + (in_tensor_info.sizes[i] - 1 - linear_index % in_tensor_info.strides[0]); + } + out_tensor_info.data[dst_offset] = in_tensor_info.data[linear_index]; + } +} + +template +__global__ +void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, + int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { + + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + if (linear_index >= N) { + return; + } + + int64_t cur_indices = linear_index, rem = 0, dst_offset = 0; + for (int64_t i = 0; i < total_dims; i++) { + int64_t temp = cur_indices; + cur_indices = cur_indices / strides_contiguous[i]; + rem = temp - cur_indices * strides_contiguous[i]; + // flip the indices if it is in flip_dims + for (int64_t j = 0; j < flip_dims_size; j++) { + if (i == flip_dims[j]) { + cur_indices = shape[i] - 1 - cur_indices; + } + } + dst_offset += cur_indices * strides[i]; + cur_indices = rem; + } + out_tensor[linear_index] = in_tensor[dst_offset]; +} + +// Flip tensor given a list of dims +Tensor flip_cuda(const Tensor& self, IntList dims) { + auto in_tensor = self; + const int64_t flip_dims_size = dims.size(), total_dims = in_tensor.dim(), N = in_tensor.numel(); + check_errors(total_dims, flip_dims_size, dims); + + int64_t block_size = 512; + 
dim3 dim_block(block_size); + dim3 dim_grid((N + block_size - 1) / block_size); + + // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work + if (flip_dims_size == 1 && in_tensor.is_contiguous() && (dims[0] == 0 || dims[0] == total_dims - 1)) { + auto out_tensor = at::empty_like(self); + AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { + auto in_tensor_info = cuda::detail::getTensorInfo(in_tensor); + auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); + int flip_dim = in_tensor_info.collapseDims(dims[0]); + out_tensor_info.collapseDims(dims[0]); + kernel_pointwise_flip_apply2 + <<>>( + in_tensor_info, out_tensor_info, N, flip_dim, total_dims); + }); + return out_tensor; + } + + auto flip_dims = std::vector(dims); + auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); + + auto shape = std::vector(in_tensor.sizes()); + auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); + + auto strides = std::vector(in_tensor.strides()); + auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); + + auto out_tensor = at::empty_like(in_tensor); + + // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), + // it is used to compute indices for each element in non-contiguous tensor + Tensor stride_contiguous = at::zeros({total_dims}, kLong); + int64_t* stride_contiguous_d = stride_contiguous.data(); + for (int64_t i = total_dims - 1; i >= 0; i--) { + if (i == total_dims - 1) { + stride_contiguous_d[i] = 1; + } else { + stride_contiguous_d[i] = std::max(shape[i+1], 1) * stride_contiguous_d[i + 1]; + } + } + + AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { + flip_cuda_kernel<<>>( + in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, + strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); + }); + + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu new file mode 100644 index 0000000..32dc7d3 --- /dev/null +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -0,0 +1,90 @@ +#include "ATen/ATen.h" + +#include +#include +#include + +#include +#include +#include + +namespace at { +namespace native{ + +#ifndef __HIP_PLATFORM_HCC__ + +namespace { +template +__global__ void inverse_indices_kernel( + const scalar_t* input_data, + const scalar_t* output_data, + int64_t* inverse_indices_data, + int64_t num_inp, + int64_t num_out) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride = blockDim.x * gridDim.x; + + for (int64_t i = idx; i < num_inp * num_out; i += stride) { + if (input_data[i / num_out] == output_data[i % num_out]){ + inverse_indices_data[i / num_out] = i % num_out; + } + } + } + + +template + std::tuple _unique_cuda_template( + const Tensor& self, + const bool return_inverse) { + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + const Tensor& input = self.contiguous(); + int64_t num_inp = input.numel(); + const scalar_t* input_data = input.data(); + + //sort & unique + Tensor output = input.clone(); + output = output.view(-1); + scalar_t* output_data = output.data(); + 
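+    // For example, an input of [1, 3, 1, 2] is sorted to [1, 1, 2, 3] and then
+    // compacted by thrust::unique to [1, 2, 3], so num_out == 3.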
thrust::sort(policy, output_data, output_data + num_inp); + scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp); + int64_t num_out = output_end - output_data; + output.resize_(num_out); + + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + + if (return_inverse) { + inverse_indices.resize_(input.sizes()); + int64_t* inverse_indices_data = inverse_indices.data(); + int block = 512; + int grid = std::min((num_inp * num_out + block - 1) / block, 2048L); + inverse_indices_kernel<<>>( + input_data, output_data, inverse_indices_data, num_inp, num_out); + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + + } +} // namespace + +#endif + +std::tuple +_unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { +#ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] { + // The current CUDA implementation of unique always sort due to the + // lack of hashtable implementation in thrust + return _unique_cuda_template(self, return_inverse); + }); +#else + AT_ERROR("unique_cuda: HIP not supported"); +#endif +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp new file mode 100644 index 0000000..f73a2ad --- /dev/null +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor cudnn_affine_grid_generator_forward( + const Tensor& theta, + int64_t N, int64_t C, int64_t H, int64_t W) { + throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); +} + +Tensor cudnn_affine_grid_generator_backward( + const Tensor& grad_theta, + int64_t N, int64_t C, int64_t H, int64_t W) { + throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED() + +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +void setSamplerDescriptor(SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + int N, int C, int H, int W) +{ + int inputSize[4] = {N, C, H, W}; + desc.set(dataType, 4, inputSize); +} + +} // namespace + +Tensor cudnn_affine_grid_generator_forward( + const Tensor& theta_t, + int64_t N, int64_t C, int64_t H, int64_t W) +{ + setCuDNNStreamToCurrent(); + + TensorArg theta{ theta_t.contiguous(), "theta", 1 }; + CheckedFrom c = "cudnn_affine_grid_generator_forward"; + checkContiguous(c, theta); + checkSize(c, theta, {N, 2, 3}); + + auto grid_t = theta->type().tensor(); + grid_t.resize_({N, H, W, 2}); + + auto dataType = getCudnnDataType(*theta); + SpatialTransformerDescriptor desc; + setSamplerDescriptor(desc, dataType, N, C, H, W); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward(getCudnnHandle(), desc.desc(), + theta->data_ptr(), + grid_t.data_ptr())); + return grid_t; +} + +Tensor cudnn_affine_grid_generator_backward( + const Tensor& grad_grid_t, + int64_t N, int64_t C, int64_t H, int64_t W) +{ + setCuDNNStreamToCurrent(); + + TensorArg grad_grid{ grad_grid_t.contiguous(), "grad_grid", 1 }; + CheckedFrom c = "cudnn_affine_grid_generator_backward"; + checkContiguous(c, grad_grid); + checkSize(c, grad_grid, {N, H, W, 2}); + + auto grad_theta_t = grad_grid->type().tensor(); + grad_theta_t.resize_({N, 2, 3}); 
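+  // grad_grid has shape N x H x W x 2 (one (x, y) sampling location per
+  // output pixel, matching the forward grid); cuDNN reduces it back to the
+  // N x 2 x 3 affine parameters in grad_theta below.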
+ + auto dataType = getCudnnDataType(grad_theta_t); + SpatialTransformerDescriptor desc; + setSamplerDescriptor(desc, dataType, N, C, H, W); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward(getCudnnHandle(), desc.desc(), + grad_grid->data_ptr(), + grad_theta_t.data_ptr())); + return grad_theta_t; +} + +}} // namespace at::native + +#endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp new file mode 100644 index 0000000..9b2a256 --- /dev/null +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple cudnn_batch_norm( + const Tensor& input, const Tensor& weight, + const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, + bool training, double exponential_average_factor, double epsilon) { + throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_batch_norm_backward( + const Tensor& input, const Tensor& grad_output, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean, const Tensor& save_var, + double epsilon) { + throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); +} + +}} // namespace at::native + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +Tensor expandScale(const Tensor& t, int64_t dim) { + std::vector size{ 1, t.numel() }; + while (static_cast(size.size()) < dim) { + size.emplace_back(1); + } + return t.view(size); +} + +} // namespace + +std::tuple cudnn_batch_norm( + const Tensor& input_t, const Tensor& weight_t, + const Tensor& bias_t, const Tensor& running_mean_t, const Tensor& running_var_t, + bool training, double exponential_average_factor, double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + weight{ weight_t, "weight", 2 }, + bias{ bias_t, "bias", 3 }, + running_mean{ running_mean_t, "running_mean", 4 }, + running_var{ running_var_t, "running_var", 5 }; + CheckedFrom c = "cudnn_batch_norm"; + setCuDNNStreamToCurrent(); + + checkAllDefined(c, {input, weight, bias}); + if (!training) { + checkAllDefined(c, {running_mean, running_var}); + } + checkAllSameGPU(c, {input, weight, bias, running_mean, running_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {weight, bias, running_mean, running_var}); + // TODO: is weight required to be contiguous? 
+ checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + auto num_features = input->size(1); + for (auto t : {weight, bias, running_mean, running_var}) { + if (t->defined()) { + checkNumel(c, t, num_features); + } + } + + cudnnBatchNormMode_t mode; + if (input->dim() == 2) { + mode = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode = CUDNN_BATCHNORM_SPATIAL; +#if CUDNN_VERSION >= 7003 + if(training) + mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#endif + } + + auto output_t = input->type().tensor(input->sizes()); + TensorArg output{ output_t, "output", 0 }; + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + TensorDescriptor idesc{ *input, 4 }; // input descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. + + Constant one(dataType, 1); + Constant zero(dataType, 0); + Tensor save_mean, save_var; + + if (training) { + int64_t num_features = input_t.size(1); + save_mean = weight_t.type().tensor({ num_features }); + save_var = weight_t.type().tensor({ num_features }); + AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + exponential_average_factor, + at::maybe_data_ptr(running_mean), + at::maybe_data_ptr(running_var), + epsilon, + save_mean.data_ptr(), + save_var.data_ptr())); + } else { + AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + running_mean->data_ptr(), + running_var->data_ptr(), + epsilon)); + } + + // save_mean and save_var can be undefined + // If this causes problems, we can initialize them to empty tensors + // of the correct type + return std::tuple{output_t, save_mean, save_var}; +} + +// NB: CuDNN only implements the backward algorithm for batchnorm +// in training mode (evaluation mode batchnorm has a different algorithm), +// which is why this doesn't accept a 'training' parameter. +std::tuple cudnn_batch_norm_backward( + const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t, + // Unused: but we require them to be passed so that double backwards + // has access + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean_t, const Tensor& save_var_t, + double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + grad_output{ grad_output_t, "grad_output", 2 }, + weight{ weight_t, "weight", 3 }, + save_mean{ save_mean_t, "save_mean", 4 }, + save_var{ save_var_t, "save_var", 5 }; + CheckedFrom c = "cudnn_batch_norm_backward"; + setCuDNNStreamToCurrent(); + + checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); + checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {input, grad_output}); + checkAllSameType(c, {weight, save_mean, save_var}); + // TODO: is weight required to be contiguous? 
+ checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + checkSameSize(c, input, grad_output); + auto num_features = input->size(1); + for (auto t : {weight, save_mean, save_var}) { + checkNumel(c, t, num_features); + } + + cudnnBatchNormMode_t mode; + if (input->dim() == 2) { + mode = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { +#if CUDNN_VERSION >= 7003 + mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode = CUDNN_BATCHNORM_SPATIAL; +#endif + } + + auto grad_input_t = input->type().tensor(input->sizes()); + auto grad_weight_t = weight->type().tensor(weight->sizes()); + auto grad_bias_t = weight->type().tensor(weight->sizes()); + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + + TensorDescriptor idesc{ *input, 4 }; // input, output, grad_output descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, save_mean, etc. + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnBatchNormalizationBackward( + handle, mode, &one, &zero, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), grad_output->data_ptr(), + idesc.desc(), grad_input_t.data_ptr(), + wdesc.desc(), weight->data_ptr(), + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->data_ptr(), + save_var->data_ptr())); + + return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; +} + +}} // namespace native + +#endif diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp new file mode 100644 index 0000000..b3ee016 --- /dev/null +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -0,0 +1,1204 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_bias( + const at::Tensor& grad_output) { + throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + 
IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose_backward_input( + const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include "THC/THC.h" + +#include +#include +#include +#include +#include "ATen/native/utils/ParamsHash.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// TODO: Go through all the checking code again and make sure +// we haven't missed anything. + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. 
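+// For example, with kernel 3, stride 2, padding 1 and dilation 1 (using the
+// formulas implemented just below), spatial inputs of size 223 and 224 both
+// map to an output of size 112:
+//
+//   (223 + 2*1 - 3) / 2 + 1 == 112
+//   (224 + 2*1 - 3) / 2 + 1 == 112
+//
+// so the input size cannot be recovered from the output size alone;
+// output_padding of 0 vs. 1 picks between the two candidates:
+//
+//   (112 - 1) * 2 - 2*1 + 3 + 0 == 223
+//   (112 - 1) * 2 - 2*1 + 3 + 1 == 224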
+ +std::vector conv_output_size( + IntList input_size, IntList weight_size, + IntList padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (size_t d = 2; d < dim; ++d) { + auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) + - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +std::vector conv_input_size( + IntList output_size, IntList weight_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +std::vector conv_weight_size( + IntList input_size, IntList output_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + 2 * padding[d - 2] - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +// TODO: Move this into the standard library, with a better name? +Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { + auto group_size = t.size(dim) / groups; + return t.narrow(dim, group_idx * group_size, group_size); +} + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Note [Legacy CuDNN grouped convolution support] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// CuDNN earlier than CuDNN 7 does not directly support group +// convolution, so we provide support for it by sequentially +// running a convolution per group with appropriately +// adjusted sizes. https://blog.yani.io/filter-group-tutorial/ +// has a fairly good diagram explaining how it works. 
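+// As a concrete sketch of that emulation (illustrative only; the dispatch
+// functions below open-code this loop under #if CUDNN_VERSION < 7000, and
+// per_group_conv here stands in for one of the raw_cudnn_* calls): each group
+// is a slice of the channel dimensions, carved out with narrowGroup above,
+// and the per-group convolution runs with groups == 1.
+//
+//   template <typename PerGroupConv>
+//   void run_grouped(const Tensor& input, const Tensor& weight,
+//                    const Tensor& output, int64_t groups,
+//                    PerGroupConv per_group_conv) {
+//     for (int i = 0; i < groups; i++) {
+//       per_group_conv(
+//           narrowGroup(input, input_channels_dim, i, groups),
+//           narrowGroup(weight, weight_output_channels_dim, i, groups),
+//           narrowGroup(output, output_channels_dim, i, groups));
+//     }
+//   }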
+ +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) +{ + if (args.size() > expected_size){ + std::stringstream ss; + ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + else if (args.size() < expected_size){ + std::stringstream ss; + ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } +} + + +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) +static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntList padding, IntList stride, IntList dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// This POD struct is used to let us easily compute hashes of the +// parameters +struct ConvolutionParams +{ + cudnnDataType_t dataType; + int input_size[2 + max_dim]; + int input_stride[2 + max_dim]; + int weight_size[2 + max_dim]; + int padding[max_dim]; + int stride[max_dim]; + int dilation[max_dim]; + int64_t groups; + bool deterministic; + // NB: transposed purposely omitted: transposed just swaps + // forward and backward, so you can reuse the benchmark entry, +}; + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. +// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. 
(OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool deterministic) { + + cudnnDataType_t dataType = getCudnnDataType(input); + memset(params, 0, sizeof(ConvolutionParams)); + params->dataType = dataType; + // ASSERT(weight.dim() == input.dim()) + for (int i = 0; i != input.dim(); ++i) { + params->input_size[i] = (int) input.size(i); + params->input_stride[i] = (int) input.stride(i); + params->weight_size[i] = (int) weight.size(i); + } + // ASSERT(padding.size() == stride.size()) + // ASSERT(padding.size() == dilation.size()) + for (size_t i = 0; i != padding.size(); ++i) { + params->padding[i] = padding[i]; + params->stride[i] = stride[i]; + params->dilation[i] = dilation[i]; + } + // In principle, we shouldn't parametrize by groups for legacy + // CuDNN, but it doesn't seem worth the effort to actually do this. + params->groups = groups; + params->deterministic = deterministic; +} + +// Convenience struct for passing around descriptors and data +// pointers +struct ConvolutionArgs { + cudnnHandle_t handle; + ConvolutionParams params; + TensorDescriptor idesc, odesc; + FilterDescriptor wdesc; + const Tensor& input, output, weight; + ConvolutionDescriptor cdesc; + + ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { + } +}; + +// --------------------------------------------------------------------- +// +// Benchmarking +// +// --------------------------------------------------------------------- + +// TODO: Use something less heavy duty than a big honking mutex +template +struct BenchmarkCache { + std::mutex mutex; + std::unordered_map, ParamsEqual> map; + + bool find(const ConvolutionParams& params, T* results) { + std::lock_guard guard(mutex); + auto it = map.find(params); + if (it == map.end()) { + return false; + } + *results = it->second; + return true; + } + + void insert(const ConvolutionParams& params, const T& results) { + std::lock_guard guard(mutex); + map[params] = results; + } +}; + +BenchmarkCache fwd_algos; +BenchmarkCache bwd_data_algos; +BenchmarkCache bwd_filter_algos; + +// TODO: Stop manually allocating CUDA memory; allocate an ATen byte +// tensor instead. 
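+// A sketch of what that TODO could look like (an assumption, not code used in
+// this file): back the workspace with an ATen byte tensor so the caching
+// allocator owns the memory, assuming a CUDA byte factory analogous to the
+// at::CPU(kInt) factory used later in this patch:
+//
+//   Tensor alloc_workspace(size_t size) {
+//     return at::CUDA(at::kByte).tensor(static_cast<int64_t>(size));
+//   }
+//
+// For now, Workspace below manages the allocation manually with
+// THCudaMalloc/THCudaFree.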
+struct Workspace { + Workspace(size_t size) : size(size), data(NULL) { + data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + } + Workspace(const Workspace&) = delete; + Workspace(Workspace&&) = default; + Workspace& operator=(Workspace&&) = default; + ~Workspace() { + if (data) { + THCudaFree(globalContext().lazyInitCUDA(), data); + } + } + + size_t size; + void* data; +}; + +template +struct algorithm_search { +}; + +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionFwdAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + sz + ); +} +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + sz); +} +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + sz); +} + +template +size_t getMaxWorkspaceSize( + const ConvolutionArgs& args, + const algo_t *algo, int n_algo) +{ + THCState *state = globalContext().lazyInitCUDA(); + + size_t max_ws_size = 0; + size_t max_block_size = 0; + size_t total_gpu_mem = 0; + size_t free_gpu_mem = 0; + + THCudaCheck(THCudaMemGetInfoCached(state, &free_gpu_mem, &total_gpu_mem, &max_block_size)); + + for (int i = 0; i < n_algo; i++) { + cudnnStatus_t err; + size_t sz; + err = getWorkspaceSize(args, algo[i], &sz); + if (CUDNN_STATUS_SUCCESS != err || sz == 0 + || sz < max_ws_size || sz > max_block_size) continue; + max_ws_size = sz; + } + return max_ws_size; +} + +template +perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { + if (deterministic) { + // iterate over perf results of all algorithms and find the best deterministic algo + for (int i = 0; i < n_algo; i++) { + // TODO: Shouldn't all returned results be successful? 
+ // Double check documentation for cudnnFindConvolutionForwardAlgorithmEx + if (perfResults[i].status == CUDNN_STATUS_SUCCESS && + perfResults[i].determinism == CUDNN_DETERMINISTIC) { + return perfResults[i]; + } + } + throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + } else { + return perfResults[0]; + } +} + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionFwdAlgoPerf_t; + using algo_t = cudnnConvolutionFwdAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static BenchmarkCache& cache() { return fwd_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, + }; + static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution forward algorithms"); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + AT_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), args.output.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm( + const ConvolutionArgs& args, + algo_t* algo) + { + cudnnConvolutionFwdPreference_t pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + AT_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + pref, + 0, + algo)); + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize)); + } +}; + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; + using algo_t = cudnnConvolutionBwdDataAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + static BenchmarkCache& cache() { return bwd_data_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED + }; + static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward data algorithms."); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + AT_CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, + args.wdesc.desc(), 
args.weight.data_ptr(), + args.odesc.desc(), args.output.data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), args.input.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, + 0, + algo)); + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize)); + } +}; + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; + using algo_t = cudnnConvolutionBwdFilterAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + + static BenchmarkCache& cache() { return bwd_filter_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, +#if CUDNN_VERSION >= 6000 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, +#endif + }; + // NOTE: - 1 because ALGO_WINOGRAD is not implemented + static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + int perf_count; + Workspace ws(max_ws_size); + + AT_CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.odesc.desc(), args.output.data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), args.weight.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, + 0, + algo) + ); + } + + static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize)); + } +}; + +template +void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { + using search = algorithm_search; + auto& cache = search::cache(); + + if (cache.find(args.params, algo)) { + return; + } + + if (args.params.deterministic && !benchmark) { + *algo = search::DEFAULT_ALGO; + return; + } + + if (!benchmark) { + search::getAlgorithm(args, algo); + return; + } + + if (cache.find(args.params, algo)) { + // re-check cache since another thread may have benchmarked the algorithm + return; + } 
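+  // Neither the cache nor a shortcut applies, so run the actual benchmark:
+  // search::findAlgorithm below times the candidate algorithms via
+  // cudnnFind*AlgorithmEx, the winning (optionally deterministic) result is
+  // cached, and DEFAULT_ALGO is used if nothing usable is reported.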
+ + auto perfResults = search::findAlgorithm(args); + // for deterministic algo, look at all the perf results and return the best + // deterministic algo + if (perfResults.status == CUDNN_STATUS_SUCCESS && + !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) { + *algo = perfResults.algo; + } else { + *algo = search::DEFAULT_ALGO; + } + cache.insert(args.params, *algo); + + // Free the cached blocks in our caching allocator. They are + // needed here because the above benchmarking uses a huge amount of memory, + // e.g. a few GBs. + THCCachingAllocator_emptyCache(); +} + +template +Workspace chooseAlgorithm( + const ConvolutionArgs& args, + bool benchmark, + algo_t* algo) +{ + findAlgorithm(args, benchmark, algo); + + using search = algorithm_search; + size_t workspace_size; + search::getWorkspaceSize(args, *algo, &workspace_size); + try { + return Workspace(workspace_size); + } catch (std::runtime_error& e) { + cudaGetLastError(); // clear OOM error + + // switch to default algorithm and record it in the cache to prevent + // further OOM errors + *algo = search::DEFAULT_ALGO; + search::cache().insert(args.params, *algo); + + search::getWorkspaceSize(args, *algo, &workspace_size); + return Workspace(workspace_size); + } +} + +// --------------------------------------------------------------------- +// +// Bias addition +// +// --------------------------------------------------------------------- + +// In-place! +void cudnn_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias) +{ + checkAllSameType(c, {output, bias}); + checkAllSameGPU(c, {output, bias}); + checkSize(c, bias, { output->size(output_channels_dim) }); + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's padding argument to do the rest. + TensorDescriptor bdesc, odesc; + bdesc.set(bias->expand({1, bias->size(0)}), output->dim()); + odesc.set(*output); + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*bias); + Constant one(dataType, 1); + + AT_CUDNN_CHECK(cudnnAddTensor(handle, &one, bdesc.desc(), bias->data_ptr(), + &one, odesc.desc(), output->data_ptr())); +} + +// The general strategy: +// +// - cudnn_convolution (Tensor) +// Entry points for clients, takes bias +// +// - cudnn_convolution_forward (TensorArg) +// Entry point, which may be reused between regular +// convolution and transposed convolution. Does NOT take bias. +// +// - raw_cudnn_convolution_forward_out (Tensor) +// Low level function which invokes CuDNN, and takes an output +// tensor which is directly written to (thus _out). +// +// Where does argument checking happen? Here's the division of +// responsibility: +// - Things that happen in at::Tensor +// - TensorArg allocation +// - setCuDNNStreamToCurrent +// - Things that happen in TensorArg +// - Check arguments (type, GPU, shape) +// +// TODO: Consider renaming zero-indexed arguments to "self" + + + +// --------------------------------------------------------------------- +// +// Convolution forward / Transposed convolution backward +// +// --------------------------------------------------------------------- + +// The raw API directly invokes CuDNN and does not emulate support +// for group convolution on old versions of CuDNN. +// +// There are a few reasons this should never be directly exposed +// via ATen: +// +// - It takes output as a parameter (this should be computed!) 
+// - It doesn't do input checking +// - It doesn't resize output (it is assumed to be correctly sized) +// - It takes a ConvolutionParams struct +// +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(input); + + ConvolutionArgs args{ input, output, weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(weight); + args.odesc.set(output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + // TODO: when we do legacy group convolution support, we'll repeatedly + // reinitialize the workspace for each convolution we do. This is + // wasteful; we'd rather reuse the workspace. OTOH, legacy group + // convolution support is already pretty slow, so this might not + // matter. (This applies to raw_cudnn_convolution_backward_input as well.) + cudnnConvolutionFwdAlgo_t fwdAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlg, workspace.data, workspace.size, + &zero, args.odesc.desc(), output.data_ptr())); +} + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + auto output_t = input->type().tensor( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation, groups)); + + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{ output_t, "result", 0 }; + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_forward_out( + narrowGroup(*output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_forward_out( + *output, *input, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *output; +} + +Tensor cudnn_convolution( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + setCuDNNStreamToCurrent(); + CheckedFrom c = "cudnn_convolution"; + auto output_t = cudnn_convolution_forward( + c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// NB: output_padding not needed here, as there is no ambiguity to +// resolve +Tensor 
cudnn_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg grad_output { grad_output_t, "grad_output", 1 }, + weight { weight_t, "weight", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_forward( + "cudnn_convolution_transpose_backward_input", + grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// +// Convolution backward / Transposed convolution forward +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(grad_output); + + ConvolutionArgs args{ grad_input, grad_output, weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(grad_input); + args.wdesc.set(weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + cudnnConvolutionBwdDataAlgo_t bwdDataAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardData( + args.handle, + &one, args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.data_ptr(), + args.cdesc.desc(), bwdDataAlg, workspace.data, workspace.size, + &zero, args.idesc.desc(), grad_input.data_ptr())); +} + +// Backward and transpose are algorithmically equivalent, but they +// compute their geometry differently. In a backwards, you knew what +// the original size of the input tensor was, so you can cache that +// geometry and fill it directly. In transposed convolution, it is +// more conventional to not explicitly specify the output (previously +// input) size, and compute it. This, however, leaves a degree of +// freedom; this degree of freedom is resolved using the +// output_padding parameter. Both of these interfaces are equivalent, +// but they are differently convenient depending on the use case. 
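+// Concretely, the entry points in this file pair up as follows:
+//
+//   cudnn_convolution (forward):
+//       conv_output_size                    -> cudnnConvolutionForward
+//   cudnn_convolution_backward_input:
+//       caller-supplied input_size          -> cudnnConvolutionBackwardData
+//   cudnn_convolution_transpose (forward):
+//       conv_input_size (+ output_padding)  -> cudnnConvolutionBackwardData
+//   cudnn_convolution_transpose_backward_input:
+//       conv_output_size                    -> cudnnConvolutionForward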
+ +Tensor cudnn_convolution_backward_input( + CheckedFrom c, + IntList input_size, const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + + auto grad_input_t = grad_output->type().tensor(input_size); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_input_out( + narrowGroup(*grad_input, input_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_input_out( + *grad_input, *grad_output, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *grad_input; +} + +Tensor cudnn_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), + padding, output_padding, stride, dilation, groups); + return cudnn_convolution_backward_input(c, input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_backward_input( + IntList input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_input( + "cudnn_convolution_backward_input", + input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor cudnn_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + CheckedFrom 
c = "cudnn_convolution_transpose"; + auto output_t = cudnn_convolution_transpose_forward( + c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(input); + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + cudnnConvolutionBwdFilterAlgo_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardFilter( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.odesc.desc(), grad_output.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, workspace.data, workspace.size, + &zero, args.wdesc.desc(), grad_weight.data_ptr())); +} + +Tensor cudnn_convolution_backward_weight( + CheckedFrom c, + IntList weight_size, const TensorArg& grad_output, const TensorArg& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = grad_output->type().tensor(weight_size); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
+ TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_weight_out( + narrowGroup(*grad_weight, weight_output_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + padding, stride, dilation, groups, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_weight_out( + *grad_weight, *grad_output, *input, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return grad_weight_t; +} + +Tensor cudnn_convolution_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + setCuDNNStreamToCurrent(); + + auto grad_bias_t = grad_output->type().tensor( + { grad_output->size(output_channels_dim) }); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's pad argument to do the rest. 
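+  // i.e. the 1-d bias of shape (C) is expanded to (1, C) by hand, and the
+  // descriptor's pad argument then appends trailing singleton dimensions up
+  // to grad_output->dim(), so cuDNN sees (1, C, 1, 1) in the usual 4-d case.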
+ TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +} + + +}} // namespace + +#endif diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp new file mode 100644 index 0000000..c6b7ffc --- /dev/null +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -0,0 +1,147 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor cudnn_grid_sampler_forward( + const Tensor& input_t, const Tensor& grid_t) { + throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_grid_sampler_backward( + const Tensor& input_t, const Tensor& grid_t, + const Tensor& grad_output_t) { + throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +// TODO: descriptor checking + + +namespace at { namespace native { + +namespace { + +void setSamplerDescriptor(SpatialTransformerDescriptor& desc, cudnnDataType_t dataType, const at::Tensor& tensor) +{ + int inputSize[4] = {0}; + for (int i = 0; i < tensor.dim(); ++i) { + inputSize[i] = (int) tensor.size(i); + } + desc.set(dataType, 4, inputSize); +} + +void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) +{ + // assert size of grid is n*h*w*2 + // FYI: grid is between [-1, 1], where -1 left most pixel, + // 1 represents right most pixel (and hence 0 is the center pixel) + // if grid has values >1 or <-1, those values are ignored + checkContiguous(c, grid); + checkDim(c, grid, 4); + // TODO: Maybe more user friendly to report where the expected size + // came from + checkSize(c, grid, 0, input->size(0)); + checkSize(c, grid, 3, 2); +} + +} // namespace + +Tensor cudnn_grid_sampler_forward( + const Tensor& input_t, const Tensor& grid_t) +{ + TensorArg input{ contiguousIfZeroInStrides(input_t), "input", 1 }, + grid{ grid_t.contiguous(), "grid", 2 }; + CheckedFrom c = "cudnn_grid_sampler_forward"; + setCuDNNStreamToCurrent(); + checkAllSameGPU(c, {input, grid}); + checkAllSameType(c, {input, grid}); + checkGridSize(c, grid, input); + checkDim(c, input, 4); + + auto output_t = input->type().tensor(); + output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); + + TensorDescriptor idesc{ *input }; // input descriptor + TensorDescriptor odesc{ output_t }; // output descriptor + SpatialTransformerDescriptor desc; // sampler descriptor + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + setSamplerDescriptor(desc, dataType, output_t); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + AT_CUDNN_CHECK(cudnnSpatialTfSamplerForward( + handle, desc.desc(), + &one, idesc.desc(), input->data_ptr(), + grid->data_ptr(), + &zero, odesc.desc(), output_t.data_ptr() + )); + + return output_t; +} + +// NB: CuDNN does not support output mask; you always get both +// gradients. 
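+// Shape-wise (for both directions): an input of shape (N, C, H_in, W_in)
+// sampled with a grid of shape (N, H_out, W_out, 2) produces an output of
+// shape (N, C, H_out, W_out); the backward below returns grad_input with
+// input's shape and grad_grid with grid's shape. Each grid entry is a pair of
+// normalized coordinates in [-1, 1], as described in checkGridSize above.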
+std::tuple cudnn_grid_sampler_backward( + const Tensor& input_t, const Tensor& grid_t, + const Tensor& grad_output_t) +{ + TensorArg input{ contiguousIfZeroInStrides(input_t), "input", 1 }, + grid{ grid_t.contiguous(), "grid", 2 }, + grad_output{ contiguousIfZeroInStrides(grad_output_t), "grad_output", 3 }; + CheckedFrom c = "cudnn_grid_sampler_backward"; + setCuDNNStreamToCurrent(); + checkAllSameGPU(c, {input, grad_output, grid}); + checkGridSize(c, grid, input); + checkDim(c, input, 4); + checkDim(c, grad_output, 4); + + auto grad_input_t = input->type().tensor(); + grad_input_t.resize_(input->sizes()); + auto grad_grid_t = grid->type().tensor(); + grad_grid_t.resize_(grid->sizes()); + + TensorDescriptor idesc{ *input }; // input descriptor + TensorDescriptor odesc{ *grad_output }; // grad_output descriptor + TensorDescriptor gdesc{ grad_input_t }; // grad_input descriptor + SpatialTransformerDescriptor desc; // sampler descriptor + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + setSamplerDescriptor(desc, dataType, *grad_output); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + AT_CUDNN_CHECK(cudnnSpatialTfSamplerBackward( + handle, desc.desc(), + &one, idesc.desc(), input->data_ptr(), + &zero, gdesc.desc(), grad_input_t.data_ptr(), + &one, odesc.desc(), grad_output->data_ptr(), + // intruigingly, the outputs don't need descriptors + grid->data_ptr(), + &zero, grad_grid_t.data_ptr() + )); + + return std::tuple{ grad_input_t, grad_grid_t }; +} + +}} // namespace at::cudnn + +#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp new file mode 100644 index 0000000..aced0a0 --- /dev/null +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -0,0 +1,1011 @@ +#include +#include +#include +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor _cudnn_rnn_flatten_weight( + TensorList weight_arr, int64_t weight_stride0, + int64_t input_size, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, + bool fn_bidirectional + ) { + throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); +} + +std::tuple _cudnn_rnn( + const Tensor& input_r, + TensorList weight, int64_t weight_stride0, + const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state + ) { + throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); +} + +std::tuple> _cudnn_rnn_backward( + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r, + const Tensor& grad_cy_r, + int64_t mode, int64_t hidden_size, + int64_t num_layers, bool batch_first, double dropout, + bool train, bool bidirectional, IntList batch_sizes, + const Tensor& dropout_state, const Tensor& reserve, + std::array output_mask + ) { + throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); +} + +Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { + throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); +} + +}} // namespace at::native + +#else // 
AT_CUDNN_ENABLED() + +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + // DropoutDescriptor + + struct DropoutDescriptorParams { + bool train; + double dropout; + Tensor dropout_state; + DropoutDescriptorParams() {} + void set(bool train_, double dropout_, Tensor dropout_state_) { + train = train_; + dropout = dropout_; + dropout_state = dropout_state_; + } + DropoutDescriptor descriptor(cudnnHandle_t handle) const { + auto dropout_p = train ? dropout : 0; + DropoutDescriptor dropout_desc; + if (dropout_p == 0) { + dropout_desc.set_no_dropout(handle); + } else { + dropout_desc.set(handle, dropout_p, dropout_state); + } + return dropout_desc; + } + }; + + // RNNDescriptor + + struct RNNDescriptorParams { + int64_t hidden_size; + int64_t num_layers; + cudnnDirectionMode_t bidirectional; + cudnnRNNMode_t mode; + cudnnDataType_t datatype; + + cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; + + int64_t num_directions() const { + return bidirectional ? 2 : 1; + } + + void set_mode(int64_t fn_mode) { + switch (fn_mode) { + case CUDNN_RNN_RELU: + mode = CUDNN_RNN_RELU; + break; + case CUDNN_RNN_TANH: + mode = CUDNN_RNN_TANH; + break; + case CUDNN_LSTM: + mode = CUDNN_LSTM; + break; + case CUDNN_GRU: + mode = CUDNN_GRU; + break; + default: + { + std::ostringstream oss; + oss << "unrecognized cuDNN RNN mode " << fn_mode; + throw std::runtime_error(oss.str()); + } + } + } + + void set_bidirectional(bool fn_bidirectional) { + bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + } + + void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype) { + this->set_mode(mode); + this->hidden_size = hidden_size; + this->num_layers = num_layers; + this->set_bidirectional(bidirectional); + this->datatype = datatype; + } + + + RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { + RNNDescriptor rnn_desc; + rnn_desc.set(handle, hidden_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype); + return rnn_desc; + } + + // In some cases, a use of RNNDescriptor does not rely on the + // DropoutDescriptor. In this case, we fake up a no-dropout + // descriptor to make the RNN descriptor initialization go through. + // This is used by _cudnn_rnn_flatten_weight, which needs an + // RNNDescriptor for get_parameters(), but does not actually need + // a fully initialized dropout descriptor. This lets us avoid + // having to pass the dropout state to flatten, which has no business + // knowing what the dropout state is. + RNNDescriptor descriptor(cudnnHandle_t handle) const { + DropoutDescriptor dropout_desc; + dropout_desc.set_no_dropout(handle); + return descriptor(handle, std::move(dropout_desc)); + } + }; + + // TensorDescriptor list + + std::vector rnn_descriptor_sequence(const Tensor& tensor, IntList batch_sizes) { + std::vector descriptors(batch_sizes.size()); + size_t i = 0; + // To be mutated in the loop + std::vector batch_tensor_size(tensor.sizes()); + for (auto batch_size : batch_sizes) { + batch_tensor_size[0] = batch_size; + // NB: cuDNN RNN API does not support 2d descriptors, so we + // must pad it out to 3d. 
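+      // (Concretely: the (batch_size, input_size) slice for one time step is
+      // presumably presented to cuDNN as (batch_size, input_size, 1), since
+      // set() is asked for at least 3 dimensions here.)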
+ descriptors[i].set(getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); + i++; + } + return descriptors; + } + + std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { + std::vector descriptors(N); + for (int64_t i = 0; i < N; i++) { + descriptors[i].set(tensor, 5); + } + return descriptors; + } + + // The best way to understand the meaning of the values stored in + // this struct is to consider each of the possible ways our + // input can be structured. + // + // Suppose you want to run RNN on the following variable + // length inputs: + // + // Sequence 1: ABCD + // Sequence 2: EF + // Sequence 3: G + // + // (Let _ be padding when we have non-packed representations.) + // + // # Packed input (batch_sizes is non-empty) + // + // input_size + // +------+ + + // | A | | + // | E | mini_batch = | + // | G | batch_sizes[0] = 3 | + // +------+ | + // | B | | batch_sizes_sum = 7 + // | F | batch_sizes[1] = 2 | + // +------+ | + // | C | batch_sizes[2] = 1 | + // +------+ | + // | D | batch_sizes[3] = 1 | + // +------+ + + // + // (seq_length = 4) + // + // input.size() = batch_sizes_sum x input_size + // + // # Unpacked input (batch_first = false) + // + // mini_batch = 3 + // +-------+ + // | A E G | + // | B F _ | seq_length = 4 + // | C _ _ | + // | D _ _ | + // +-------+ + // ... input_size + // +-------+ + // + // input.size() = seq_length x mini_batch x input_size + // + // # Unpacked input (batch_first = true) + // + // seq_length = 4 + // +---------+ + // | A B C D | + // | E F _ _ | mini_batch = 3 + // | G _ _ _ | + // +---------+ + // ... input_size + // +---------+ + // + // input.size() = mini_batch x seq_length x input_size + // + struct TensorDescriptorListParams { + IntList batch_sizes; + int64_t seq_length; + int64_t mini_batch; + // NB: this is not input.size(), which is an IntList; instead, this + // size of the inner-most dimension. In NL applications, this is usually + // the size of the embedding. You can also think of this as the size + // of the "channel" dimension (at risk of confusing vision researchers :) + int64_t input_size; + // Only valid when !is_input_packed + int64_t batch_sizes_sum; // == sum(batch_sizes) + + bool is_input_packed() const { + return batch_sizes.size() != 0; + } + + void set(IntList input_sizes, IntList batch_sizes_, bool batch_first) { + batch_sizes = batch_sizes_; + if (is_input_packed()) { + seq_length = batch_sizes.size(); + mini_batch = batch_sizes[0]; + // NB: When input is packed, the mini_batch size is NOT the size + // of the outer dimension + batch_sizes_sum = input_sizes[0]; + input_size = input_sizes[1]; + } else { + if (batch_first) { + seq_length = input_sizes[1]; + mini_batch = input_sizes[0]; + } else { + seq_length = input_sizes[0]; + mini_batch = input_sizes[1]; + } + input_size = input_sizes[2]; + // TODO: Actually, would this make ASAN's job harder catching + // an uninitialized access? + batch_sizes_sum = -1; // something bogus in case we access it + } + } + + // TODO: check x for consistency with input_size? 
+ std::vector descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence(x, batch_sizes); + } else { + return rnn_descriptor(x[0], seq_length); + } + } + }; + + // Everything together + + struct RNNParams { + DropoutDescriptorParams dropout; + RNNDescriptorParams rnn; + TensorDescriptorListParams tensors; + }; + + // NB: Doesn't include the weight descriptor + struct RNNDescriptors { + RNNDescriptor rnn_desc; + // NB: this won't actually lay out the tensor descriptor pointers + // in the right way, so you'll have to preprocess them + std::vector x_descs; + std::vector y_descs; + TensorDescriptor hx_desc; + TensorDescriptor hy_desc; + TensorDescriptor cx_desc; + TensorDescriptor cy_desc; + + RNNDescriptors(const RNNParams& fn, cudnnHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) { + rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); + x_descs = fn.tensors.descriptors(x); + y_descs = fn.tensors.descriptors(y); + hx_desc.set(hx, 5); + hy_desc.set(hx, 5); + if (cx.defined()) { + cx_desc.set(cx, 5); + cy_desc.set(cx, 5); + } + } + + // TODO: This is annoying, having to put the cudnnTensorDescriptor_t + // in a contiguous array... + std::vector get_descs(const std::vector& descs) { + std::vector r; + r.reserve(descs.size()); + for (auto& desc : descs) { + r.emplace_back(desc.desc()); + } + return r; + } + + std::vector get_x_descs() { + return get_descs(x_descs); + } + + std::vector get_y_descs() { + return get_descs(y_descs); + } + }; + + int64_t get_num_weights(cudnnHandle_t handle, const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, cudnnDataType_t datatype) { + size_t weight_size; + AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); + auto elem_size = dataSize(datatype); + AT_ASSERTM(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); + return weight_size / elem_size; + } + + int64_t _num_linear_layers(cudnnRNNMode_t mode) { + switch(mode) { + case CUDNN_LSTM: + return 8; + case CUDNN_GRU: + return 6; + case CUDNN_RNN_RELU: + return 2; + case CUDNN_RNN_TANH: + return 2; + default: + AT_ERROR("unknown cuDNN RNN mode %d", mode); + } + } + + /* + Returns weight and bias tensors for each layer of the RNN. These tensors + are views on the underlying weight buffer allocated by CuDNN. + + Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively), + these parameters are concatenated along the first dimension. + These parameters are returned in a consistent order by CuDNN: + (reset, forget, cell, output) for LSTM + (reset, input, new) for GRU + Args: + fn: The RNN function object holding the RNN state + handle: a CuDNN handle + weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer + Returns: + parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], with length equal to the num_layers. 
+ This is represented as a pair of vector, and outer-dimension stride + (NB: Can't return MatrixRef because we need to allocate the underlying tensor) + */ + std::pair, size_t> // stride0 + get_parameters( + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, + const Tensor& weight_buf + ) { + auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + std::vector params; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_layers = rnn.num_directions() * rnn.num_layers; + size_t cur_offset = 0; + size_t global_layer_params_count = 0; + for (int64_t layer = 0; layer < num_layers; layer++) { + size_t layer_params_count = 0; + for (auto cudnn_method : cudnn_methods) { + for (int64_t linear_id = 0; linear_id < num_linear_layers; linear_id++) { + FilterDescriptor lin_layer_mat_desc; + void* matrix_pointer; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer + )); + cudnnDataType_t data_type; + cudnnTensorFormat_t format; + int nb_dims; + constexpr int min_dim = 3; + // TODO: The use of CPU tensor here is a bit goofy in C++, + // some sort of alloca would be good enough except that it is + // kind of convenient to be able to prod() on it. + Tensor filter_dim_a = at::CPU(kInt).tensor(min_dim); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a.data() + )); + + AT_ASSERTM(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); + filter_dim_a = filter_dim_a.slice(0, 0, nb_dims); + auto elem_size = dataSize(rnn.datatype); + auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + AT_ASSERTM(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); + size_t offset = offset_bytes / elem_size; + + // for all the RNN types provided by CUDNN, all the ih weights + // are the same size and are allocated in a contiguous chunk + // (same for the hh weights, and the ih and hh biases). + // Since we're storing all the weights in a single tensor anyway, + // might as well merge the CUDNN ones into a single tensor as well + int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); + if (linear_id == 0 || linear_id == num_linear_layers / 2) { + std::initializer_list size = { + mat_numel * num_linear_layers / 2, 1}; + // Generate a new parameter tensor which is a view into the + // weight_buf. 
+ Tensor param = weight_buf.type().tensor().set_(*weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); + layer_params_count++; + } else { + AT_ASSERTM(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); + } + cur_offset = offset + mat_numel; + } + } // for cudnn_method + if (layer == 0) { + global_layer_params_count = layer_params_count; + } else { + AT_ASSERTM(global_layer_params_count == layer_params_count, + "global_layer_params_count = ", global_layer_params_count, + "; layer_params_count = ", layer_params_count); + } + } // for layer + return std::make_pair(params, global_layer_params_count); + } + + void _copyParams(MatrixRef params_from, MatrixRef params_to) { + AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch"); + for (size_t i = 0; i < params_from.size(0); i++) { + auto layer_params_from = params_from[i]; + auto layer_params_to = params_to[i]; + // NOTE: these lists have all weights before all biases, so if the layer + // doesn't use biases, iteration will terminate once layer_params_from ends + // and ignore them. + for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); + a != layer_params_from.end() && b != layer_params_to.end(); + ++a, ++b) { + auto param_from = *a, param_to = *b; + AT_ASSERTM(param_from.type() == param_to.type(), "parameter types mismatch"); + param_to.copy_(param_from.view_as(param_to)); + } + } + } + + std::vector _input_size(const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, tensors.input_size}; + } else { + return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; + } + } + + std::vector _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; + } + + std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, rnn.hidden_size * rnn.num_directions()}; + } else { + return {tensors.seq_length, tensors.mini_batch, rnn.hidden_size * rnn.num_directions()}; + } + } + +} // anonymous namespace + +// NB: does inplace update into TensorList +// It would be a relatively simple matter to refactor this into multiple +// functions, only one of which does an inplace update, but we leave this +// for future work +Tensor _cudnn_rnn_flatten_weight( + TensorList weight_arr, int64_t weight_stride0, + int64_t input_size, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, + bool fn_bidirectional + ) { + + if (weight_arr.size() == 0) { + throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); + } + + auto any_param = weight_arr[0]; + + RNNDescriptorParams rnn; + rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(any_param)); + + auto handle = getCudnnHandle(); + RNNDescriptor rnn_desc = rnn.descriptor(handle); + + TensorGeometry x_geom({1, input_size}); + TensorDescriptor x_desc; + x_desc.set(getCudnnDataType(any_param), x_geom.sizes(), x_geom.strides(), 5); + + auto num_weights = get_num_weights(handle, rnn_desc, x_desc, rnn.datatype); + auto weight_buf = any_param.type().tensor(num_weights).zero_(); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + // Slice off views into weight_buf + std::vector params_arr; + size_t params_stride0; + std::tie(params_arr, params_stride0) = 
get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf); + + MatrixRef weight{weight_arr, static_cast(weight_stride0)}, + params{params_arr, params_stride0}; + + // Copy weights + _copyParams(weight, params); + + // Update the storage + for (size_t i = 0; i < weight.size(0); i++) { + for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin(); + orig_param_it != weight[i].end() && new_param_it != params[i].end(); + orig_param_it++, new_param_it++) { + auto orig_param = *orig_param_it, new_param = *new_param_it; + orig_param.set_(new_param.view_as(orig_param)); + } + } + + return weight_buf; +} + +// NB: when fn_batch_sizes is empty, that means no batch sizes was specified +std::tuple _cudnn_rnn( + const Tensor& input_r, + TensorList weight, int64_t weight_stride0, + const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state + ) { + + auto input = input_r; + auto weight_buf = weight_buf_r; + if (fn_dropout_state.defined()) { + auto input_arg = TensorArg(input, "input", 1); + auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); + checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); + } + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + // TODO: Set device to input + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + // TODO: can batch_first be a wrapper around this function? 
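+  // (Sketch of what such a hypothetical wrapper would look like -- it would
+  // just transpose on the way in and out, which is what the batch_first
+  // branches in this function already do inline:
+  //   auto outs = _cudnn_rnn(input.transpose(0, 1), ..., /*batch_first=*/false, ...);
+  //   std::get<0>(outs).transpose_(0, 1);
+  // )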
+ auto is_input_packed = fn.tensors.batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + } + + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto output_size = _output_size(fn.rnn, fn.tensors); + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + auto output = input.type().tensor(output_size); + auto hy = hx.type().tensor(hidden_size); + Tensor cy; + if (cx.defined()) { + cy = cx.type().tensor(hidden_size); + } else { + cy = hx.type().tensor(); // NB: Not allowed to return undefined tensors + } + auto y = output; + + auto handle = getCudnnHandle(); + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + if (!weight_buf.defined()) { + auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], fn.rnn.datatype); + weight_buf = x.type().tensor(num_weights); + w_desc.set(weight_buf, 3); + weight_buf.zero_(); + std::vector params; + size_t params_stride0; + std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); + _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, + MatrixRef{params, params_stride0}); + } else { + w_desc.set(weight_buf, 3); + } + + if (cx.defined() && !cx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); + throw std::runtime_error(oss.str()); + } + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + Tensor reserve; + // NB: Previously, the test was for fn.requires_grad, but we don't have + // this information. Use 'train' as a proxy. + if (fn_train) { + size_t reserve_size; + AT_CUDNN_CHECK(cudnnGetRNNTrainingReserveSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &reserve_size + )); + reserve = input.type().toScalarType(kByte).tensor(reserve_size); + AT_CUDNN_CHECK(cudnnRNNForwardTraining( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), weight_buf.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + descs.hy_desc.desc(), hy.data_ptr(), + descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0), + reserve.data_ptr(), reserve.size(0) + )); + } else { // inference + reserve = input.type().toScalarType(kByte).tensor(); + AT_CUDNN_CHECK(cudnnRNNForwardInference( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), weight_buf.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + descs.hy_desc.desc(), hy.data_ptr(), + descs.cy_desc.desc(), cy.defined() ? 
cy.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0) + )); + + } + + if (batch_first && !is_input_packed) { + output.transpose_(0, 1); + } + + return std::make_tuple(output, hy, cy, reserve, weight_buf); +} + +std::tuple _cudnn_rnn_backward_input( + const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy, + const Tensor& grad_cy, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state, const Tensor& fn_reserve, + std::array output_mask + ) { + + auto input = input_r; + auto grad_output = grad_output_r; + auto output = output_r; + + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + // TODO: Set device to input + auto handle = getCudnnHandle(); + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + auto is_input_packed = fn_batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + grad_output = grad_output.transpose(0, 1); + output = output.transpose(0, 1); + } + + auto input_size = _input_size(fn.tensors); + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto output_size = _output_size(fn.rnn, fn.tensors); + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + auto dy = grad_output.contiguous(); + auto y = output; + auto w = weight_buf; + auto dx = input.type().tensor(input.sizes()); // TODO: more compact way of saying this + auto dhy = grad_hy.contiguous().view(hidden_size); + auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(hidden_size) : Tensor(); + auto dhx = hx.type().tensor(hidden_size); + AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); + auto dcx = cx.defined() ? 
cx.type().tensor(hidden_size) : Tensor(); + + if (!fn_train) { + throw std::runtime_error("cudnn RNN backward can only be called in training mode"); + } + if (!input.sizes().equals(input_size)) { + std::ostringstream oss; + oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); + throw std::runtime_error(oss.str()); + } + if (!output.sizes().equals(output_size)) { + std::ostringstream oss; + oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); + throw std::runtime_error(oss.str()); + } + if (hx.defined() && !hx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); + throw std::runtime_error(oss.str()); + } + if (cx.defined() && !cx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); + throw std::runtime_error(oss.str()); + } + if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); + throw std::runtime_error(oss.str()); + } + if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); + throw std::runtime_error(oss.str()); + } + if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { + throw std::runtime_error("Gradients aren't CUDA tensors"); + } + + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + // TODO: put this in the correct device??? + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + AT_CUDNN_CHECK(cudnnRNNBackwardData( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + y_descs_arr.data(), y.data_ptr(), + y_descs_arr.data(), dy.data_ptr(), + descs.hy_desc.desc(), dhy.data_ptr(), + descs.cy_desc.desc(), cx.defined() ? dcy.data_ptr() : nullptr, + w_desc.desc(), w.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + x_descs_arr.data(), dx.data_ptr(), + descs.hx_desc.desc(), dhx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? dcx.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0), + fn_reserve.data_ptr(), fn_reserve.size(0) + )); + + if (batch_first && !is_input_packed) { + dx = dx.transpose_(0, 1); + } + + return std::make_tuple(dx, dhx, dcx); +} + +// NB: This MUST BE CALLED AFTER _cudnn_rnn_backward_input. +// We'll give a user friendly combined function... 
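+// (For illustration, a caller is expected to sequence the two backward pieces
+// the way the combined _cudnn_rnn_backward dispatcher below does -- data
+// gradients first, since that call also mutates the reserve buffer, then
+// weight gradients:
+//   std::tie(dx, dhx, dcx) = _cudnn_rnn_backward_input(...);
+//   dw = _cudnn_rnn_backward_weight(...);
+// )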
+std::vector _cudnn_rnn_backward_weight( + // TODO: I think tensor geometry sufficient for weight_buf/weight + const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0, + const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output_r, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state, const Tensor& fn_reserve + ) { + + MatrixRef weight{ weight_arr, static_cast(weight_stride0) }; + + auto input = input_r; + auto output = output_r; + + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + auto handle = getCudnnHandle(); + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + auto is_input_packed = fn_batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + output = output.transpose(0, 1); + } + + auto input_size = _input_size(fn.tensors); + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + + if (!fn_train) { + throw std::runtime_error("cudnn RNN backward can only be called in training mode"); + } + if (!input.sizes().equals(input_size)) { + std::ostringstream oss; + oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); + throw std::runtime_error(oss.str()); + } + if (hx.defined() && !hx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); + throw std::runtime_error(oss.str()); + } + // TODO: the above were the only checks in rnn.py, but it doesn't seem + // like these checks are enough + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + const auto& y = output; + auto dw = weight_buf.type().tensor(weight_buf.sizes()).zero_(); + + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + AT_CUDNN_CHECK(cudnnRNNBackwardWeights( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + workspace.data_ptr(), workspace.size(0), + w_desc.desc(), dw.data_ptr(), + fn_reserve.data_ptr(), fn_reserve.size(0) + )); + + std::vector grad_weight_arr; + grad_weight_arr.reserve( weight.numel() ); + for (const auto& w : weight_arr) { + grad_weight_arr.emplace_back(w.type().tensor(w.sizes()).zero_()); + } + + std::vector grad_params_arr; + size_t grad_params_stride0; + std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + _copyParams(MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{grad_weight_arr, static_cast(weight_stride0)}); + + return 
grad_weight_arr; // stride is known from call site (and also inconvenient to return)
+}
+
+// We need this dispatcher because _cudnn_rnn_backward_weight has a stringent
+// ordering requirement with _cudnn_rnn_backward_input
+std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>> _cudnn_rnn_backward(
+    const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
+    const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r,
+    const Tensor& grad_cy_r,
+    int64_t mode, int64_t hidden_size,
+    int64_t num_layers, bool batch_first, double dropout,
+    bool train, bool bidirectional, IntList batch_sizes,
+    const Tensor& dropout_state, const Tensor& reserve,
+    std::array<bool, 4> output_mask
+    ) {
+
+  auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output);
+  auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx);
+  auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r;
+
+  Tensor dx, dhx, dcx;
+  // NB: unconditionally compute this gradient, because it mutates reserve
+  std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]});
+  std::vector<Tensor> dw;
+  if (output_mask[3]) {
+    dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve);
+  }
+  return std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>>{dx, dhx, dcx, dw};
+}
+
+// TODO: I am not sure if we actually need the 'dropout' and 'train' parameters
+// to initialize just the state tensor
+Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) {
+  auto handle = getCudnnHandle();
+  DropoutDescriptor dropout_desc;
+  auto dropout_p = train ? dropout : 0;
+  dropout_desc.initialize_rng(ty, handle, dropout_p, dropout_seed);
+  return dropout_desc.state;
+}
+
+}} // namespace at::native
+
+#endif // AT_CUDNN_ENABLED()
diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
new file mode 100644
index 0000000..c345182
--- /dev/null
+++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
@@ -0,0 +1,302 @@
+#include "ATen/ATen.h"
+#include "ATen/NativeFunctions.h"
+#include "ATen/native/SpectralOpsUtils.h"
+#include "ATen/Config.h"
+
+#if !AT_MKL_ENABLED()
+
+namespace at { namespace native {
+
+Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim,
+                bool complex_input, bool complex_output,
+                bool inverse, IntList checked_signal_sizes,
+                bool normalized, bool onesided,
+                IntList output_sizes) {
+  throw std::runtime_error("fft: ATen not compiled with MKL support");
+}
+
+}}
+
+#else // AT_MKL_ENABLED
+
+#include "ATen/ATen.h"
+#include "ATen/Config.h"
+#include "ATen/Dispatch.h"
+#include "ATen/Utils.h"
+#include "ATen/NativeFunctions.h"
+
+#include <algorithm>
+#include <vector>
+#include <numeric>
+#include <cmath>
+
+#include <mkl_dfti.h>
+#include <ATen/mkl/Exceptions.h>
+#include <ATen/mkl/Descriptors.h>
+#include <ATen/mkl/Limits.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace at { namespace native {
+
+// In real-to-complex transform, MKL FFT only fills half of the values due to
+// conjugate symmetry. See native/SpectralOpsUtils.h for more details.
+// The following structs are used to fill in the other half with symmetry in
+// case of real-to-complex transform with onesided=False flag.
+// See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h.
+
+template <typename scalar_t>
+static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
+    int64_t signal_ndim, int64_t size_last_dim,
+    int64_t start_last_dim_idx, int64_t i, int64_t num) {
+  scalar_t *data = output.data<scalar_t>();
+
+  // A slice means a slice of last dimension (of size size_last_dim)
+
+  // This function iterates through the slices to fill, i.e. to_slice_data
+  // (basically data_slices[i:i+num]), and keeps track of the slices it reads
+  // data from, i.e., from_slice_data, using from_slice_indices, a vector
+  // containing the index of the from_slice_data slice.
+
+  // Compute the indices for the first from_slice_data
+  std::vector<int64_t> from_slice_indices(signal_ndim);  // up to before last signal dim
+  int64_t remainder = i;
+  // set last signal dim values
+  int64_t from_slice_offset = 0;
+  for (int64_t d = signal_ndim - 1; d >= 0; d--) {
+    int64_t dim_size = output.size(d);
+    int64_t dim_idx = remainder % dim_size;
+    remainder = remainder / dim_size;
+    from_slice_indices[d] = dim_idx;
+    if (d == 0) {
+      from_slice_offset += dim_idx * output.stride(d);
+    } else if (dim_idx != 0) {
+      from_slice_offset += (dim_size - dim_idx) * output.stride(d);
+    }
+  }
+
+  // First to_slice_data and from_slice_data
+  scalar_t *to_slice_data = data + i * size_last_dim * 2;
+  scalar_t *from_slice_data = data + from_slice_offset;
+
+  while (num > 0) {
+    // Fill to_slice_data from values in from_slice_data
+    for (int64_t j = start_last_dim_idx; j < size_last_dim; j++) {
+      // multiply index by 2 because the last (complex) dim has size 2
+      int64_t to_idx = j * 2;
+      int64_t from_idx = (size_last_dim - j) * 2;
+      to_slice_data[to_idx] = from_slice_data[from_idx];
+      to_slice_data[to_idx + 1] = -from_slice_data[from_idx + 1];
+    }
+    // Compute the next to_slice_data and from_slice_data slices
+    to_slice_data += size_last_dim * 2;
+    for (int64_t d = signal_ndim - 1; d >= 0; d--) {
+      // Compute the next index at this dimension using conjugate symmetry
+      // Break out of this loop if nothing carries over
+      from_slice_indices[d] = (from_slice_indices[d] + 1) % output.size(d);
+      if (d > 0) {
+        // At d > 0 nonbatch dim, to get next from_slice_data offset
+        // 1. if this dim idx becomes 1, will need to add (size - 1) * stride
+        // 2. otherwise, will need to subtract stride
+        if (from_slice_indices[d] == 0) {
+          // Subtract. Carries over to previous dimension
+          from_slice_data -= output.stride(d);
+        } else if (from_slice_indices[d] == 1) {
+          // Dimension index becomes 1
+          // Doesn't carry over to previous dimension
+          from_slice_data += (output.size(d) - 1) * output.stride(d);
+          break;
+        } else {
+          // Subtract. Doesn't carry over to previous dimension
+          from_slice_data -= output.stride(d);
+          break;
+        }
+      } else {
+        // At d = 0 nonbatch dim, it means that to_slice_data is now at the
+        // beginning of a data sample. It maps to itself by conjugate symmetry.
+        from_slice_data = to_slice_data;
+      }
+    }
+    num--;
+  }
+}
+
+// input should be a contiguous batched tensor of the same size as full (twosided)
+// signals, but only contains half (onesided) of the values.
+// This function modifies it in place.
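+// For a single 1-d signal of length n (ignoring the batch dimensions), the
+// entries filled in satisfy the usual Hermitian relation
+//   output[j] == conj(output[n - j])   for j in [last_dim_start_slice, n),
+// which the slice helper above implements by copying the real part and
+// negating the imaginary part.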
+static inline void _fft_fill_with_conjugate_symmetry_(Tensor& input, + int64_t signal_ndim, int64_t size_last_dim, + int64_t last_dim_start_slice) { + if (last_dim_start_slice >= size_last_dim) { + return; + } + + int64_t num = 1; + for (int64_t d = 0; d < signal_ndim; d++) { + num *= input.size(d); + } +#ifdef _OPENMP + if (num > 500) { + int nthreads = omp_get_num_threads(); + int64_t num_slices_per_thread = num / nthreads + 1; + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int64_t start = tid * num_slices_per_thread; + AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] { + _fft_fill_with_conjugate_symmetry_slice(input, signal_ndim, size_last_dim, + last_dim_start_slice, start, std::min(num_slices_per_thread, num - start)); + }); + } + return; + } +#endif + AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] { + _fft_fill_with_conjugate_symmetry_slice(input, signal_ndim, size_last_dim, + last_dim_start_slice, 0, num); + }); +} + +// MKL DFTI +Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, + bool complex_input, bool complex_output, + bool inverse, IntList checked_signal_sizes, + bool normalized, bool onesided, + IntList output_sizes) { + int64_t batch = self.size(0); + Tensor input = self; + // real/imag dimension must aligned when viewed as of complex type + if (complex_input) { + bool need_contiguous = input.stride(-1) != 1; + for (int64_t i = 0; !need_contiguous && i <= signal_ndim; i++) { + need_contiguous |= input.stride(i) % 2 != 0; + } + if (need_contiguous) { + input = input.contiguous(); + } + } + + // check if we can use MKL because MKL_LONG is 32bit on some OS, e.g. Windows + // need to check input and output size and strides + // be careful about complex domain, where the stride needs to be divided by 2 + // only need to test upper bound MKL_LONG_MAX as these values are non-negative + if (sizeof(MKL_LONG) < sizeof(int64_t)) { + bool need_contiguous = false; + int64_t inumel = 1 /* istride if we contiguous-fy */, onumel = 1; + int64_t isize, osize, istride, ostride; + for (int64_t i = signal_ndim; i >= 0; i--) { + isize = input.size(i); + osize = output_sizes[i]; + istride = complex_input ? input.stride(i) >> 1 : input.stride(i); + ostride = onumel; + if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { + std::ostringstream ss; + ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " + << MKL_LONG_MAX << "]"; + throw std::runtime_error(ss.str()); + } + if (!need_contiguous && istride > MKL_LONG_MAX) { + // If we didn't plan to contiguous-fy but the `istride` exceeds bound, + // check if we can stride (equal to `inumel`) get back within bound if + // we contiguous-fy. If so, then we need to always check `inumel` + // instead for the remaining iterations. The iterations before this are + // fine as `inumel` is non-decreasing. 
+ need_contiguous = true; + } + if (need_contiguous && inumel > MKL_LONG_MAX) { + std::ostringstream ss; + ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " + << MKL_LONG_MAX << "]"; + throw std::runtime_error(ss.str()); + } + inumel *= isize; + onumel *= osize; + } + } + Tensor output = input.type().tensor(output_sizes); + + // precision + DFTI_CONFIG_VALUE prec; + if (input.type().scalarType() == ScalarType::Float) { + prec = DFTI_SINGLE; + } else if (input.type().scalarType() == ScalarType::Double) { + prec = DFTI_DOUBLE; + } else { + std::ostringstream ss; + ss << "MKL FFT doesn't support tensor of type: " + << at::toString(input.type().scalarType()); + throw std::runtime_error(ss.str()); + } + // signal type + DFTI_CONFIG_VALUE signal_type; + if (!inverse) { + signal_type = complex_input ? DFTI_COMPLEX : DFTI_REAL; + } else { + signal_type = complex_output ? DFTI_COMPLEX : DFTI_REAL; + } + // create descriptor with signal size + std::vector mkl_signal_sizes(checked_signal_sizes.begin(), checked_signal_sizes.end()); + DftiDescriptor descriptor; + descriptor.init(prec, signal_type, signal_ndim, mkl_signal_sizes.data()); + // out of place FFT + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + // batch mode + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch)); + + auto istrides = input.strides(); + auto ostrides = output.strides(); + // batch dim stride, i.e., dist between each data + MKL_LONG idist = complex_input ? istrides[0] >> 1 : istrides[0]; + MKL_LONG odist = complex_output ? ostrides[0] >> 1 : ostrides[0]; + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + // signal strides + // first val is offset, set to zero (ignored) + std::vector mkl_istrides(1 + signal_ndim, 0), mkl_ostrides(1 + signal_ndim, 0); + for (int64_t i = 1; i <= signal_ndim; i++) { + mkl_istrides[i] = complex_input ? istrides[i] >> 1 : istrides[i]; + mkl_ostrides[i] = complex_output ? ostrides[i] >> 1 : ostrides[i]; + } + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_istrides.data())); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_ostrides.data())); + // if conjugate domain of real is involved, set standard CCE storage type + // this will become default in MKL in future + if (!complex_input || !complex_output) { + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); + } + // rescale if needed by normalized flag or inverse transform + if (normalized || inverse) { + auto signal_numel = at::prod_intlist(checked_signal_sizes); + double double_scale; + if (normalized) { + double_scale = 1.0 / std::sqrt(static_cast(signal_numel)); + } else { + double_scale = 1.0 / static_cast(signal_numel); + } + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), + inverse ? DFTI_BACKWARD_SCALE : DFTI_FORWARD_SCALE, + prec == DFTI_DOUBLE ? 
double_scale : static_cast<float>(double_scale)));
+  }
+  // finalize
+  MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get()));
+  // run
+  if (!inverse) {
+    MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), output.data_ptr()));
+  } else {
+    MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), output.data_ptr()));
+  }
+  // now if needed, fill out the other half using Hermitian symmetry dim
+  if (!complex_input && complex_output && !onesided) {
+    auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1];
+    auto start_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim);
+    _fft_fill_with_conjugate_symmetry_(output, signal_ndim, size_last_signal_dim, start_slice);
+  }
+  return output;
+}
+
+}} // namespace at::native
+
+#endif
+
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
new file mode 100644
index 0000000..00f4e8f
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -0,0 +1,447 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/Config.h>
+
+#if !AT_MKLDNN_ENABLED()
+
+namespace at { namespace native {
+
+at::Tensor mkldnn_convolution(
+    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+    IntList padding, IntList stride, IntList dilation, int64_t groups) {
+  throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
+}
+
+at::Tensor mkldnn_convolution_backward_input(
+    IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
+  throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
+    IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
+  throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> mkldnn_convolution_backward(
+    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool, 3> output_mask) {
+  throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
+}
+
+}}
+
+#else // AT_MKLDNN_ENABLED
+
+#include <ATen/mkldnn/Runtime.h>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+constexpr int input_batch_size_dim = 0;  // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0;  // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+std::vector<int64_t> conv_output_size(
+    IntList input_size, IntList weight_size,
+    IntList padding, IntList stride, IntList dilation, int64_t groups)
+{
+  auto dim = input_size.size();
+  std::vector<int64_t> output_size(dim);
+  output_size[0] = input_size[input_batch_size_dim];
+  output_size[1] = weight_size[weight_output_channels_dim];
+  for (size_t d = 2; d < dim; ++d) {
+    auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
+    output_size[d] = (input_size[d] + (2 * padding[d - 2])
+                      - kernel) / stride[d - 2] + 1;
+  }
+  return output_size;
+}
+
+at::Tensor mkldnn_convolution(
+    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+    IntList padding, IntList stride, 
IntList dilation, int64_t groups) +{ + auto output = input.type().tensor(conv_output_size( + input.sizes(), weight.sizes(), padding, stride, dilation, groups)); + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = input.size(0); + int32_t ic = input.size(1); + int32_t ih = input.size(2); + int32_t iw = input.size(3); + + int32_t oc = output.size(1); + int32_t oh = output.size(2); + int32_t ow = output.size(3); + + int32_t kh = weight.size(2); + int32_t kw = weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + auto format_x = memory::format::x; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + auto input_md = memory::desc({input_tz}, data_t, format_any); + auto weight_md = memory::desc({weight_tz}, data_t, format_any); + auto bias_md = memory::desc({bias_tz}, data_t, format_any); + auto output_md = memory::desc({output_tz}, data_t, format_any); + + std::shared_ptr conv_forward_desc; + if (bias.defined()) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + input.data_ptr()); + auto weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + weight.data_ptr()); + auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + output.data_ptr()); + + std::vector net; + + auto input_pd = conv_forward_pd->src_primitive_desc(); + auto input_memory = input_usr_memory; + if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) { + input_memory = memory(input_pd); + net.push_back(reorder(input_usr_memory, input_memory)); + } + + auto weight_pd = conv_forward_pd->weights_primitive_desc(); + auto weight_memory = weight_usr_memory; + if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) { + weight_memory = memory(weight_pd); + net.push_back(reorder(weight_usr_memory, weight_memory)); + } + + auto output_pd = conv_forward_pd->dst_primitive_desc(); + auto output_memory = output_usr_memory; + if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) { + output_memory = memory(output_pd); + } + + std::shared_ptr conv_forward; + std::shared_ptr bias_usr_memory; + if (bias.defined()) { + bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine}, + bias.data_ptr())); + conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory, + weight_memory, *bias_usr_memory, output_memory)); + } else { + conv_forward.reset(new 
convolution_forward(*conv_forward_pd, input_memory, + weight_memory, output_memory)); + } + net.push_back(*conv_forward); + + if (output_memory != output_usr_memory) { + net.push_back(reorder(output_memory, output_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return output; +} + +Tensor mkldnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) +{ + auto grad_input = grad_output.type().tensor(input_size); + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = grad_input.size(0); + int32_t ic = grad_input.size(1); + int32_t ih = grad_input.size(2); + int32_t iw = grad_input.size(3); + + int32_t oc = grad_output.size(1); + int32_t oh = grad_output.size(2); + int32_t ow = grad_output.size(3); + + int32_t kh = weight.size(2); + int32_t kw = weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + auto input_md = memory::desc({input_tz}, data_t, format_any); + auto weight_md = memory::desc({weight_tz}, data_t, format_any); + auto bias_md = memory::desc({bias_tz}, data_t, format_any); + auto output_md = memory::desc({output_tz}, data_t, format_any); + + // need to re-create conv_forward_pd to feed conv_backward_data_pd + std::shared_ptr conv_forward_desc; + if (bias_defined) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + std::shared_ptr conv_backward_data_desc; + conv_backward_data_desc.reset(new convolution_backward_data::desc( + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + + std::shared_ptr conv_backward_data_pd; + conv_backward_data_pd.reset(new convolution_backward_data::primitive_desc( + *conv_backward_data_desc, cpu_engine, *conv_forward_pd)); + + auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + grad_output.data_ptr()); + auto weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + weight.data_ptr()); + auto grad_input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + grad_input.data_ptr()); + + std::vector net; + + auto grad_output_pd = conv_backward_data_pd->diff_dst_primitive_desc(); + auto grad_output_memory = grad_output_usr_memory; + if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) { + grad_output_memory = memory(grad_output_pd); + 
net.push_back(reorder(grad_output_usr_memory, grad_output_memory)); + } + + auto weight_pd = conv_backward_data_pd->weights_primitive_desc(); + auto weight_memory = weight_usr_memory; + if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) { + weight_memory = memory(weight_pd); + net.push_back(reorder(weight_usr_memory, weight_memory)); + } + + auto grad_input_pd = conv_backward_data_pd->diff_src_primitive_desc(); + auto grad_input_memory = grad_input_usr_memory; + if (grad_input_memory.get_primitive_desc() != memory::primitive_desc(grad_input_pd)) { + grad_input_memory = memory(grad_input_pd); + } + + std::shared_ptr conv_backward_data; + conv_backward_data.reset(new convolution_backward_data(*conv_backward_data_pd, + grad_output_memory, weight_memory, grad_input_memory)); + net.push_back(*conv_backward_data); + + if (grad_input_memory != grad_input_usr_memory) { + net.push_back(reorder(grad_input_memory, grad_input_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return grad_input; +} + +std::tuple mkldnn_convolution_backward_weights( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) +{ + auto grad_weight = grad_output.type().tensor(weight_size); + + Tensor grad_bias; + if (bias_defined) { + grad_bias = grad_output.type().tensor({grad_output.size(1)}); + } + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = input.size(0); + int32_t ic = input.size(1); + int32_t ih = input.size(2); + int32_t iw = input.size(3); + + int32_t oc = grad_output.size(1); + int32_t oh = grad_output.size(2); + int32_t ow = grad_output.size(3); + + int32_t kh = grad_weight.size(2); + int32_t kw = grad_weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + auto format_x = memory::format::x; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? 
memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + memory::desc input_md({input_tz}, data_t, format_any); + memory::desc weight_md({weight_tz}, data_t, format_any); + memory::desc bias_md({bias_tz}, data_t, format_any); + memory::desc output_md({output_tz}, data_t, format_any); + + // need to re-create conv_forward_pd to feed conv_backward_weight_pd + std::shared_ptr conv_forward_desc; + if (bias_defined) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + std::shared_ptr conv_backward_weight_desc; + if (bias_defined) { + conv_backward_weight_desc.reset(new convolution_backward_weights::desc( + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_backward_weight_desc.reset(new convolution_backward_weights::desc( + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_backward_weight_pd; + conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc( + *conv_backward_weight_desc, cpu_engine, *conv_forward_pd)); + + auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + input.data_ptr()); + auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + grad_output.data_ptr()); + auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + grad_weight.data_ptr()); + std::shared_ptr grad_bias_memory; + + std::vector net; + + auto input_pd = conv_backward_weight_pd->src_primitive_desc(); + auto input_memory = input_usr_memory; + if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) { + input_memory = memory(input_pd); + net.push_back(reorder(input_usr_memory, input_memory)); + } + + auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc(); + auto grad_output_memory = grad_output_usr_memory; + if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) { + grad_output_memory = memory(grad_output_pd); + net.push_back(reorder(grad_output_usr_memory, grad_output_memory)); + } + + auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc(); + auto grad_weight_memory = grad_weight_usr_memory; + if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) { + grad_weight_memory = memory(grad_weight_pd); + } + + std::shared_ptr conv_backward_weight; + if (bias_defined) { + grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine}, + grad_bias.data_ptr())); + conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd, + input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory)); + } else { + conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd, + input_memory, grad_output_memory, grad_weight_memory)); + 
} + + net.push_back(*conv_backward_weight); + + if (grad_weight_memory != grad_weight_usr_memory) { + net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return std::tuple{grad_weight, grad_bias}; +} + +std::tuple mkldnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) +{ + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::mkldnn_convolution_backward_input( + input.sizes(), grad_output, weight, padding, stride, dilation, groups, output_mask[2]); + } + if (output_mask[1] || output_mask[2]) { + std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights( + weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2]); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml new file mode 100644 index 0000000..edadf9b --- /dev/null +++ b/aten/src/ATen/native/native_functions.yaml @@ -0,0 +1,1926 @@ +# See README.md in this directory for more guidance + + +# Temporary type cast operators. These are needed to trace type-casts now since +# Type's are not supported in the IR. Instead, we call down to these +# specialized operators for each datatype. +# TODO: remove when we have Type support in the IR +- func: _cast_Byte(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Char(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Double(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Float(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Int(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Long(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Short(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor + variants: function + dispatch: + CUDA: _cudnn_rnn_flatten_weight + +- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_rnn + +- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? 
dropout_state, Tensor reserve, std::array output_mask) -> (Tensor, Tensor, Tensor, TensorList) + variants: function + dispatch: + CUDA: _cudnn_rnn_backward + +- func: _cudnn_init_dropout_state(Type self_ty, double dropout, bool train, int64_t dropout_seed) -> Tensor + variants: function + dispatch: + CUDA: _cudnn_init_dropout_state + +- func: abs(Tensor self) -> Tensor + +- func: abs_(Tensor self) -> Tensor + dispatch: + CPU: _abs__cpu + CUDA: _abs__cuda + +- func: abs_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _abs_out_cpu + CUDA: _abs_out_cuda + +- func: acos(Tensor self) -> Tensor + +- func: acos_(Tensor self) -> Tensor + dispatch: + CPU: _acos__cpu + CUDA: _acos__cuda + +- func: acos_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _acos_out_cpu + CUDA: _acos_out_cuda + +- func: avg_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, bool ceil_mode=false, bool count_include_pad=true) -> Tensor + variants: function + +- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor + variants: function + +- func: adaptive_max_pool1d(Tensor self, IntList[1] output_size) -> (Tensor, Tensor) + variants: function + +- func: allclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> bool + device_guard: false + +- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addmv_(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addmv_out(Tensor result, Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addr_(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + +- func: addr_out(Tensor result, Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: all(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: all_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function + +- func: any(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: any_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function + +- func: arange(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: arange(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor + variants: function + +- func: arange(Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar end) -> Tensor + variants: function + +- func: arange(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor + variants: function + deprecated: true + +- func: arange(Type dtype, Scalar end) -> Tensor + variants: function + deprecated: true + +# This function is a temporary hack to allow tracing of arange like constructs with dynamic +# bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; +# if the range you need is based on another tensor, calling this function directly will +# preserve tracing. 
Get rid of this when arange can directly take tensors for bounds +# (so that it can be traced directly). +- func: _dim_arange(Tensor like, int64_t dim) -> Tensor + variants: function + +# `argmin` and `argmax` are exposed in C++ but not in Python, where we only +# expose `_argmin` and `_argmax` (which call the first versions). In Python, we +# then define our own `argmax` and `argmin` that handle passing `dim=None`, +# which gets the argmax/argmin of the flattened array. + +- func: argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor +- func: argmax(Tensor self) -> Tensor +- func: _argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor +- func: argmin(Tensor self) -> Tensor +- func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +# The actual implementations live in Declarations.cwrap. These are just to +# provide default values for storage_offset=self.storage_offset() +- func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor +- func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor + +- func: asin(Tensor self) -> Tensor + +- func: asin_(Tensor self) -> Tensor + dispatch: + CPU: _asin__cpu + CUDA: _asin__cuda + +- func: asin_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _asin_out_cpu + CUDA: _asin_out_cuda + +- func: atan(Tensor self) -> Tensor + +- func: atan_(Tensor self) -> Tensor + dispatch: + CPU: _atan__cpu + CUDA: _atan__cuda + +- func: atan_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _atan_out_cpu + CUDA: _atan_out_cuda + +- func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + +- func: bernoulli(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli(Tensor self, double p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli(Tensor self) -> Tensor + +- func: bernoulli_(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli_(Tensor self, double p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli_(Tensor self) -> Tensor + +- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor + variants: function + +- func: bincount(Tensor self, Tensor? 
weights={}, int64_t minlength=0) -> Tensor + dispatch: + CPU: _bincount_cpu + CUDA: _bincount_cuda + +- func: blackman_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: cat(TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: ceil(Tensor self) -> Tensor + +- func: ceil_(Tensor self) -> Tensor + dispatch: + CPU: _ceil__cpu + CUDA: _ceil__cuda + +- func: ceil_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _ceil_out_cpu + CUDA: _ceil_out_cuda + +- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList + +- func: cudnn_is_acceptable(Tensor self) -> bool + variants: function + device_guard: false + +- func: convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) -> Tensor + variants: function + +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + variants: function + +- func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) -> Tensor + variants: function + +# NB: We MUST call the input self, otherwise codegen will attempt to +# dispatch on ggI... which might be undefined. +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + +- func: conv1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor + +- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) + +# NB: we inherit the goofy argument order from PyTorch torch.nn.functional +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] output_padding=0, int64_t groups=1, IntList[1] dilation=1) -> Tensor + variants: function + +- func: conv_transpose2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, int64_t groups=1, IntList[2] dilation=1) -> Tensor + variants: function + +- func: conv_transpose3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, int64_t groups=1, IntList[3] dilation=1) -> Tensor + variants: function + +- func: cos(Tensor self) -> Tensor + +- func: cos_(Tensor self) -> Tensor + dispatch: + CPU: _cos__cpu + CUDA: _cos__cuda + +- func: cos_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _cos_out_cpu + CUDA: _cos_out_cuda + +- func: cosh(Tensor self) -> Tensor + +- func: cosh_(Tensor self) -> Tensor + dispatch: + CPU: _cosh__cpu + CUDA: _cosh__cuda + +- func: cosh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _cosh_out_cpu + CUDA: _cosh_out_cuda + +- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: cudnn_affine_grid_generator(Tensor theta, int64_t N, int64_t C, int64_t H, int64_t W) -> Tensor + return: + - type: Tensor + name: grid + variants: function + dispatch: + CUDA: cudnn_affine_grid_generator_forward + +# TODO: Why do I have to call this grad?! +- func: cudnn_affine_grid_generator_backward(Tensor grad, int64_t N, int64_t C, int64_t H, int64_t W) + return: + - type: Tensor + name: grad_theta + variants: function + dispatch: + CUDA: cudnn_affine_grid_generator_backward + +- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_batch_norm + +# NB: You can only use this if you used cudnn_batch_norm training=True +- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, double epsilon) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_batch_norm_backward + +- func: cudnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution + +- func: cudnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_input + +- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_convolution_backward + +- func: cudnn_convolution_backward_bias(Tensor grad_output) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_bias + +- func: cudnn_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_weight + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose + +# NB: output_padding not strictly needed here, but it's helpful for the double +# backwards +- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward + +- func: cudnn_convolution_transpose_backward_bias(Tensor grad_output) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_bias + +- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward_input + +- func: cudnn_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward_weight + +# NB: input is special cased in a way I don't quite understand +- func: cudnn_grid_sampler(Tensor self, Tensor grid) + return: + - type: Tensor + name: output + variants: function + dispatch: + CUDA: cudnn_grid_sampler_forward + +- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) + return: + - type: Tensor + name: grad_self + - type: Tensor + name: grad_grid + variants: function + dispatch: + CUDA: cudnn_grid_sampler_backward + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
+- func: cumsum(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: cumsum(Tensor self, int64_t dim) -> Tensor + +- func: cumsum_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: cumsum_out(Tensor result, Tensor self, int64_t dim) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: cumprod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: cumprod(Tensor self, int64_t dim) -> Tensor + +- func: cumprod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor + variants: function + +- func: det(Tensor self) -> Tensor + +- func: diagflat(Tensor self, int64_t offset=0) -> Tensor + variants: function + +- func: diagonal(Tensor self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) -> Tensor + +- func: dot(Tensor self, Tensor tensor) -> Tensor + +- func: dot_out(Tensor result, Tensor self, Tensor tensor) -> Tensor + variants: function + +- func: einsum(std::string equation, TensorList tensors) -> Tensor + variants: function + +- func: embedding(Tensor weight, IndexTensor indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) -> Tensor + variants: function + +- func: embedding_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor + variants: function + +- func: embedding_dense_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor + variants: function + dispatch: + CPU: embedding_dense_backward_cpu + CUDA: embedding_dense_backward_cuda + +- func: embedding_renorm_(Tensor self, IndexTensor indices, double max_norm, double norm_type) -> Tensor + variants: function + dispatch: + CPU: embedding_renorm_cpu_ + CUDA: embedding_renorm_cuda_ + +- func: embedding_sparse_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor + variants: function + +# NOTE [ embedding_bag Native Functions ] +# The `_embedding_bag.*` variants assume that input tensors except for `weight`, +# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous. +# We really only need to enforce this for `_embedding_bag` (the forward) because +# the backward inputs are the same as forward ones. +# The above `embedding_bag` wrapper is created to achieve this, e.g., +# applying indices = indices.contiguous(). +# The backward functions apply a check that these input tensors are contiguous. 
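As a reading aid for the note above, here is a minimal sketch of how a caller-facing wrapper can satisfy that contiguity contract before handing the index tensors to the `_embedding_bag` kernel declared just below. This is illustrative only, not the wrapper shipped in this diff; the helper name `embedding_bag_contiguous_sketch` is invented for the example.

```
// Sketch only: normalize index tensors to contiguous memory before calling
// the _embedding_bag kernel, per the contiguity note above.
#include <ATen/ATen.h>
#include <tuple>

std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
embedding_bag_contiguous_sketch(
    const at::Tensor& weight,
    const at::Tensor& indices,
    const at::Tensor& offsets,
    bool scale_grad_by_freq = false,
    int64_t mode = 0,
    bool sparse = false) {
  // _embedding_bag assumes `indices` and `offsets` are contiguous;
  // .contiguous() copies only when a tensor is not already contiguous.
  return at::_embedding_bag(weight,
                            indices.contiguous(),
                            offsets.contiguous(),
                            scale_grad_by_freq,
                            mode,
                            sparse);
}
```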
+ +- func: embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) + variants: function + +- func: _embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: _embedding_bag_cpu + CUDA: _embedding_bag_cuda + +- func: _embedding_bag_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) -> Tensor + variants: function + +- func: _embedding_bag_sparse_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor + variants: function + +- func: _embedding_bag_dense_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor + variants: function + dispatch: + CPU: _embedding_bag_dense_backward_cpu + CUDA: _embedding_bag_dense_backward_cuda + +- func: empty(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: empty_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: empty_like(Tensor self) -> Tensor + variants: function + +- func: empty_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: empty(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: erf(Tensor self) -> Tensor + +- func: erf_(Tensor self) -> Tensor + dispatch: + CPU: _erf__cpu + CUDA: _erf__cuda + +- func: erf_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _erf_out_cpu + CUDA: _erf_out_cuda + +- func: erfc(Tensor self) -> Tensor + +- func: erfc_(Tensor self) -> Tensor + dispatch: + CPU: _erfc__cpu + CUDA: _erfc__cuda + +- func: erfc_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _erfc_out_cpu + CUDA: _erfc_out_cuda + +- func: exp(Tensor self) -> Tensor + +- func: exp_(Tensor self) -> Tensor + dispatch: + CPU: _exp__cpu + CUDA: _exp__cuda + +- func: exp_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _exp_out_cpu + CUDA: _exp_out_cuda + +- func: expm1(Tensor self) -> Tensor + +- func: expm1_(Tensor self) -> Tensor + dispatch: + CPU: _expm1__cpu + CUDA: _expm1__cuda + +- func: expm1_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _expm1_out_cpu + CUDA: _expm1_out_cuda + +- func: expand(Tensor self, IntList size, *, bool implicit=false) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + +- func: expand_as(Tensor self, Tensor other) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
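The two `expand*` entries above are method-only; a short usage sketch shows what that looks like from C++ with the factory functions declared elsewhere in this file. The function name `expand_example` and the shapes are made up for illustration.

```
// Usage sketch for the method-only expand / expand_as declarations above.
#include <ATen/ATen.h>

void expand_example() {
  at::Tensor row = at::ones({1, 4});   // 1 x 4
  at::Tensor mat = at::zeros({3, 4});  // 3 x 4
  // expand broadcasts to an explicit size and returns a view (no copy).
  at::Tensor a = row.expand({3, 4});
  // expand_as is shorthand for expanding to another tensor's sizes.
  at::Tensor b = row.expand_as(mat);
}
```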
+ +- func: eye(int64_t n, TensorOptions options={}) -> Tensor + variants: function + +- func: eye(int64_t n, int64_t m, TensorOptions options={}) -> Tensor + variants: function + +- func: eye_out(Tensor result, int64_t n) -> Tensor + variants: function + dispatch: + CPU: eye_out_cpu + CUDA: eye_out_cuda + +- func: eye_out(Tensor result, int64_t n, int64_t m) -> Tensor + variants: function + dispatch: + CPU: eye_out_cpu + CUDA: eye_out_cuda + +- func: eye(Type dtype, int64_t n, int64_t m=-1) -> Tensor + variants: function + deprecated: true + +- func: flatten(Tensor self, int64_t start_dim=0, int64_t end_dim=-1) -> Tensor + +- func: fill_(Tensor self, Scalar value) -> Tensor + +- func: fill_(Tensor self, Tensor value) -> Tensor + +- func: floor(Tensor self) -> Tensor + +- func: floor_(Tensor self) -> Tensor + dispatch: + CPU: _floor__cpu + CUDA: _floor__cuda + +- func: floor_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _floor_out_cpu + CUDA: _floor_out_cuda + +- func: full(IntList size, Scalar fill_value, TensorOptions options={}) -> Tensor + variants: function + +- func: full_out(Tensor result, IntList size, Scalar fill_value) -> Tensor + variants: function + +- func: full_like(Tensor self, Scalar fill_value) -> Tensor + variants: function + +- func: full_like(Tensor self, Scalar fill_value, *, TensorOptions options) -> Tensor + variants: function + +- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor + variants: function + deprecated: true + +- func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor + variants: function + +- func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: hann_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, double alpha, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, double alpha, double beta, TensorOptions options={}) -> Tensor + variants: function + +- func: hinge_embedding_loss(Tensor self, Tensor target, double margin=1.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: ger(Tensor self, Tensor vec2) -> Tensor + +- func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor + variants: function + +- func: gesv(Tensor self, Tensor A) -> (Tensor, Tensor) + +- func: gesv_out(Tensor solution, Tensor lu, Tensor self, Tensor A) -> (Tensor, Tensor) + variants: function + +# gesv handles broadcasting of arbitrary batch dims while _gesv_helper does not. +- func: _gesv_helper(Tensor self, Tensor A) -> (Tensor, Tensor) + dispatch: + CPU: _gesv_helper_cpu + CUDA: _gesv_helper_cuda + +- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor + variants: function + +# FFT + +- func: fft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + +- func: ifft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + +- func: rfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true) -> Tensor + +- func: irfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) -> Tensor + +- func: _fft_with_size(Tensor self, int64_t signal_ndim, bool complex_input, bool complex_output, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) -> Tensor + dispatch: + CPU: _fft_mkl + CUDA: _fft_cufft + +- func: _cufft_get_plan_cache_size() -> int64_t + variants: function + device_guard: false + +- func: _cufft_get_plan_cache_max_size() -> int64_t + variants: function + device_guard: false + +- func: _cufft_set_plan_cache_max_size(int64_t max_size) + variants: function + device_guard: false + +- func: _cufft_clear_plan_cache() + variants: function + device_guard: false + +- func: index(Tensor self, TensorList indices) -> Tensor + # NB: This function is special-cased in tools/autograd/gen_variable_type.py + +- func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor + variants: method + +- func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor + +- func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor + +- func: inverse(Tensor self) -> Tensor + +- func: inverse_out(Tensor result, Tensor self) -> Tensor + variants: function + +- func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor + +- func: is_cuda(Tensor self) -> bool + device_guard: false + +- func: is_distributed(Tensor self) -> bool + device_guard: false + +- func: is_floating_point(Tensor self) -> bool + device_guard: false + +- func: is_nonzero(Tensor self) -> bool + device_guard: false + +- func: is_same_size(Tensor self, Tensor other) -> bool + device_guard: false + +- func: is_signed(Tensor self) -> bool + device_guard: false + +- func: is_sparse(Tensor self) -> bool + device_guard: false + +- func: kthvalue(Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + +- func: kthvalue_out(Tensor values, Tensor indices, Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: layer_norm(Tensor input, IntList normalized_shape, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enable=True) -> Tensor + variants: function + +- func: linspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: linspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor + variants: function + +- func: linspace_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor + variants: function + +- func: linspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor + variants: function + deprecated: true + +- func: log(Tensor self) -> Tensor + +- func: log_(Tensor self) -> Tensor + dispatch: + CPU: _log__cpu + CUDA: _log__cuda + +- func: log_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log_out_cpu + CUDA: _log_out_cuda + +- func: log10(Tensor self) -> Tensor + +- func: log10_(Tensor self) -> Tensor + dispatch: + CPU: _log10__cpu + CUDA: _log10__cuda + +- func: log10_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log10_out_cpu + CUDA: _log10_out_cuda + +- func: log1p(Tensor self) -> Tensor + +- func: log1p_(Tensor self) -> Tensor + dispatch: + CPU: _log1p__cpu + CUDA: _log1p__cuda + SparseCPU: log1p_sparse_ + SparseCUDA: log1p_sparse_ + +- func: log1p_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log1p_out_cpu + CUDA: _log1p_out_cuda + SparseCPU: log1p_out_sparse + SparseCUDA: log1p_out_sparse + +- func: log2(Tensor self) -> Tensor + +- func: log2_(Tensor self) -> Tensor + dispatch: + CPU: _log2__cpu + CUDA: _log2__cuda + +- func: log2_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log2_out_cpu + CUDA: _log2_out_cuda + +- func: logdet(Tensor self) -> Tensor + +- func: logspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: logspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor + variants: function + +- func: logspace_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: logspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor + variants: function + +- func: logspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor + variants: function + deprecated: true + +- func: log_softmax(Tensor self, int64_t dim) -> Tensor + dispatch: + CPU: log_softmax_cpu + CUDA: log_softmax_cuda + +- func: log_softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor + dispatch: + CPU: log_softmax_backward_cpu + CUDA: log_softmax_backward_cuda + +- func: logsumexp(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: matmul(Tensor self, Tensor other) -> Tensor + +- func: matmul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + +- func: max(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: max_out(Tensor max, Tensor max_values, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: 
max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor) + variants: function + +- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +- func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +- func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: mean(Tensor self, *, ScalarType dtype) -> Tensor + +- func: mean(Tensor self) -> Tensor + +- func: mean(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: mean(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: mean(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: mean_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: median(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: median_out(Tensor values, Tensor indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: min(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: min_out(Tensor min, Tensor min_indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: min_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? 
bias, IntList padding, IntList stride, IntList dilation, int64_t groups) -> Tensor + variants: function + +- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> Tensor + variants: function + +- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> (Tensor, Tensor) + variants: function + +- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + +- func: mm(Tensor self, Tensor mat2) -> Tensor + +- func: mm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + +- func: mode(Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + +- func: mode_out(Tensor values, Tensor indices, Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: mv(Tensor self, Tensor vec) -> Tensor + +- func: mv_out(Tensor result, Tensor self, Tensor vec) -> Tensor + variants: function + +- func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + +- func: ones(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: ones_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: ones_like(Tensor self) -> Tensor + variants: function + +- func: ones_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: ones(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: pairwise_distance(Tensor x1, Tensor x2, double p=2, double eps=1e-6, bool keepdim=false) -> Tensor + variants: function + +- func: permute(Tensor self, IntList dims) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
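Like `expand` above, `permute` is exposed as a method only. The sketch below (name `permute_example` and shapes chosen purely for illustration) shows the call and the resulting size order.

```
// Usage sketch for the method-only permute declaration above.
#include <ATen/ATen.h>

void permute_example() {
  at::Tensor t = at::ones({2, 3, 4});
  // Move the last dimension to the front; p is a view with sizes {4, 2, 3}.
  at::Tensor p = t.permute({2, 0, 1});
}
```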
+ +- func: pin_memory(Tensor self) -> Tensor + +- func: pinverse(Tensor self, double rcond=1e-15) -> Tensor + +- func: rand(IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: rand(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: rand_out(Tensor result, IntList size, *) -> Tensor + variants: function + +- func: rand_out(Tensor result, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: rand_like(Tensor self) -> Tensor + variants: function + +- func: rand_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: rand(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint(int64_t high, IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t low, int64_t high, IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t low, int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(Type dtype, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint(Type dtype, int64_t low, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint_out(Tensor result, int64_t high, IntList size, *) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t high, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t high) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t low, int64_t high) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t high, *, TensorOptions options) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t low, int64_t high, *, TensorOptions options) -> Tensor + variants: function + +- func: randn(IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randn(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randn_out(Tensor result, IntList size, *) -> Tensor + variants: function + +- func: randn_out(Tensor result, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randn_like(Tensor self) -> Tensor + variants: function + +- func: randn_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: randn(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randperm(int64_t n, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randperm(int64_t n, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randperm_out(Tensor result, int64_t n, *) -> Tensor + variants: function + +- func: randperm_out(Tensor result, int64_t n, *, Generator* generator) -> Tensor + variants: function + dispatch: + CPU: 
randperm_out_cpu + CUDA: randperm_out_cuda + +- func: randperm(Type dtype, int64_t n, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: range(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: range(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor + variants: function + +- func: range_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: range_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor + variants: function + +- func: range(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor + variants: function + deprecated: true + +- func: repeat(Tensor self, IntList repeats) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + +- func: reshape(Tensor self, IntList shape) -> Tensor + +- func: reshape_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: RoiPooling2d_forward_cpu + CUDA: RoiPooling2d_forward_cuda + +- func: RoiPooling2d_backward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, Tensor gradOutput, Tensor argmaxes) -> Tensor + variants: function + dispatch: + CPU: RoiPooling2d_backward_cpu + CUDA: RoiPooling2d_backward_cuda + +- func: round(Tensor self) -> Tensor + +- func: round_(Tensor self) -> Tensor + dispatch: + CPU: _round__cpu + CUDA: _round__cuda + +- func: round_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _round_out_cpu + CUDA: _round_out_cuda + +- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor + variants: function + +- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor + variants: function + +- func: relu(Tensor self) -> Tensor + +- func: relu_(Tensor self) -> Tensor + +- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + dispatch: + CPU: hardshrink_cpu + CUDA: hardshrink_cuda + +- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + dispatch: + CPU: hardshrink_backward_cpu + CUDA: hardshrink_backward_cuda + +- func: rsqrt(Tensor self) -> Tensor + +- func: rsqrt_(Tensor self) -> Tensor + dispatch: + CPU: _rsqrt__cpu + CUDA: _rsqrt__cuda + +- func: rsqrt_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _rsqrt_out_cpu + CUDA: _rsqrt_out_cuda + +- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor + +- func: selu(Tensor self) -> Tensor + variants: function + +- func: selu_(Tensor self) -> Tensor + variants: function + +- func: sigmoid(Tensor self) -> Tensor + +- func: sigmoid_(Tensor self) -> Tensor + dispatch: + CPU: _sigmoid__cpu + CUDA: _sigmoid__cuda + +- func: sigmoid_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sigmoid_out_cpu + CUDA: _sigmoid_out_cuda + +- func: sin(Tensor self) -> Tensor + +- func: sin_(Tensor self) -> Tensor + dispatch: + CPU: _sin__cpu + CUDA: _sin__cuda + +- func: sin_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sin_out_cpu + CUDA: _sin_out_cuda + +- func: sinh(Tensor self) -> Tensor + +- func: 
sinh_(Tensor self) -> Tensor + dispatch: + CPU: _sinh__cpu + CUDA: _sinh__cuda + +- func: sinh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sinh_out_cpu + CUDA: _sinh_out_cuda + +- func: size(Tensor self, int64_t dim) -> int64_t + device_guard: false + +- func: slice(Tensor self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) -> Tensor + +- func: slogdet(Tensor self) -> (Tensor, Tensor) + +- func: smm(Tensor self, Tensor mat2) -> Tensor + +- func: softmax(Tensor self, int64_t dim) -> Tensor + dispatch: + CPU: softmax_cpu + CUDA: softmax_cuda + +- func: softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor + dispatch: + CPU: softmax_backward_cpu + CUDA: softmax_backward_cuda + +- func: split(Tensor self, int64_t split_size, int64_t dim=0) -> TensorList + +- func: split_with_sizes(Tensor self, IntList split_sizes, int64_t dim=0) -> TensorList + +- func: squeeze(Tensor self) -> Tensor + +- func: squeeze(Tensor self, int64_t dim) -> Tensor + +- func: squeeze_(Tensor self) -> Tensor + variants: method + +- func: squeeze_(Tensor self, int64_t dim) -> Tensor + variants: method + +- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: sspaddmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: _sspaddmm_out_only_sparse + CUDA: _sspaddmm_out_only_sparse_cuda + SparseCPU: _sspaddmm_out_cpu + SparseCUDA: _sspaddmm_out_cuda + +- func: stack(TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +# The signature is designed to be consistent with librosa except that it is +# missing the `pad_mode` and `center` arguments, which are taken care of at +# `torch.functional.py`. They shall be moved here once we have mapping between +# Python strings and C++ Enum in codegen. +- func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor + python_default_init: + hop_length: n_fft >> 2 + win_length: n_fft + +- func: stride(Tensor self, int64_t dim) -> int64_t + device_guard: false + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
+- func: sum(Tensor self, *, ScalarType dtype) -> Tensor + +- func: sum(Tensor self) -> Tensor + +- func: _sum(Tensor self) -> Tensor + dispatch: + CPU: _sum_cpu + CUDA: _sum_cuda + +- func: sum(Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + +- func: sum(Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor + +- func: _sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: _sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function + +- func: _sum_cuda_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + dispatch: + CUDA: _sum_out_cuda + +- func: sqrt(Tensor self) -> Tensor + +- func: sqrt_(Tensor self) -> Tensor + dispatch: + CPU: _sqrt__cpu + CUDA: _sqrt__cuda + +- func: sqrt_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sqrt_out_cpu + CUDA: _sqrt_out_cuda + +- func: std(Tensor self, bool unbiased=true) -> Tensor + +- func: std(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + +- func: std_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: prod(Tensor self, *, ScalarType dtype) -> Tensor + +- func: prod(Tensor self) -> Tensor + +- func: _prod(Tensor self) -> Tensor + dispatch: + CPU: _prod_cpu + CUDA: _prod_cuda + +- func: prod(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: prod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: _prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: prod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: _prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + dispatch: + CPU: _prod_out_cpu + CUDA: _prod_out_cuda + +- func: t(Tensor self) -> Tensor + +- func: t_(Tensor self) -> Tensor + variants: method + +- func: tan(Tensor self) -> Tensor + +- func: tan_(Tensor self) -> Tensor + dispatch: + CPU: _tan__cpu + CUDA: _tan__cuda + +- func: tan_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _tan_out_cpu + CUDA: _tan_out_cuda + +- func: tanh(Tensor self) -> Tensor + +- func: tanh_(Tensor self) -> Tensor + dispatch: + CPU: _tanh__cpu + CUDA: _tanh__cuda + +- func: tanh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _tanh_out_cpu + CUDA: _tanh_out_cuda + +- func: transpose(Tensor self, int64_t dim0, int64_t dim1) -> Tensor + +- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor + variants: method + 
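The FIXME earlier in this hunk explains why `sum` (and `prod`) carry separate dtype overloads instead of a single optional `dtype`. The sketch below just shows how the three public `sum` signatures declared above are selected at the call site; the function name `sum_example`, the shape, and the choice of `at::kDouble` are illustrative assumptions.

```
// Call-site sketch for the sum overloads declared earlier in this hunk.
#include <ATen/ATen.h>

void sum_example() {
  at::Tensor t = at::ones({2, 3});
  at::Tensor total     = at::sum(t);                          // sum(Tensor self)
  at::Tensor as_double = at::sum(t, at::kDouble);             // sum(Tensor self, *, ScalarType dtype)
  at::Tensor per_col   = at::sum(t, {0}, /*keepdim=*/false);  // sum(Tensor self, IntList[1] dim, bool keepdim)
}
```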
+- func: flip(Tensor self, IntList dims) -> Tensor + dispatch: + CPU: flip_cpu + CUDA: flip_cuda + +- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor + variants: function + +- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: trunc(Tensor self) -> Tensor + +- func: trunc_(Tensor self) -> Tensor + dispatch: + CPU: _trunc__cpu + CUDA: _trunc__cuda + +- func: trunc_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _trunc_out_cpu + CUDA: _trunc_out_cuda + +- func: type_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: _unique(Tensor self, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_cpu + CUDA: _unique_cuda + +- func: _unsafe_view(Tensor self, IntList size) -> Tensor + variants: function + +- func: unsqueeze(Tensor self, int64_t dim) -> Tensor + +- func: unsqueeze_(Tensor self, int64_t dim) -> Tensor + variants: method + +- func: var(Tensor self, bool unbiased=true) -> Tensor + +- func: var(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + +- func: var_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function + +- func: view_as(Tensor self, Tensor other) -> Tensor + variants: method + +# we define both of these because 'where' does the broadcast and '_s_where' doesn't; +# this allows us to implicitly calculate the broadcast derivative, while only dealing with the +# _s_where derivative. +- func: where(BoolTensor condition, Tensor self, Tensor other) -> Tensor +- func: _s_where(BoolTensor condition, Tensor self, Tensor other) -> Tensor + dispatch: + CPU: _s_where_cpu + CUDA: _s_where_cuda + +- func: zeros(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: zeros_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: zeros_like(Tensor self) -> Tensor + variants: function + +- func: zeros_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: zeros(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor + dispatch: + CPU: _standard_gamma_grad_cpu + CUDA: _standard_gamma_grad_cuda + +- func: _standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor + dispatch: + CPU: _s_gamma_cpu + CUDA: _s_gamma_cuda + +- func: poisson(Tensor self, Generator* generator=nullptr) -> Tensor + variants: function + dispatch: + CPU: _s_poisson_cpu + CUDA: _s_poisson_cuda + +# When more variants get ported to native, this dispatch will get more +# complicated + +- func: native_norm(Tensor self, Scalar p=2) -> Tensor + variants: function + dispatch: + SparseCPU: norm_sparse + SparseCUDA: norm_sparse + +- func: norm(Tensor self, Scalar p=2) -> Tensor + variants: method, function + +- func: norm(Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor + python_default_init: + p: 2 + +- func: norm_out(Tensor result, Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor + variants: function + python_default_init: + p: 2 + +- func: native_clone(Tensor self) -> Tensor + variants: function + dispatch: + SparseCPU: clone_sparse + SparseCUDA: clone_sparse + +- func: 
clone(Tensor self) -> Tensor + +- func: native_resize_as_(Tensor self, Tensor the_template) -> Tensor + variants: function + dispatch: + SparseCPU: resize_as_sparse_ + SparseCUDA: resize_as_sparse_ + +- func: resize_as_(Tensor self, Tensor the_template) -> Tensor + +- func: native_pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor + variants: function + dispatch: + SparseCPU: pow_out_sparse_scalar + SparseCUDA: pow_out_sparse_scalar + +- func: native_pow(Tensor self, Scalar exponent) -> Tensor + variants: function + dispatch: + SparseCPU: pow_sparse_scalar + SparseCUDA: pow_sparse_scalar + +- func: pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor + variants: function + +- func: pow(Tensor self, Scalar exponent) -> Tensor + variants: method, function + +- func: native_zero_(Tensor self) -> Tensor + variants: function + dispatch: + SparseCPU: zero_sparse_ + SparseCUDA: zero_sparse_ + +- func: zero_(Tensor self) -> Tensor + +- func: s_native_add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_out_sparse_cpu + SparseCUDA: s_add_out_sparse_cuda + +- func: native_add_out(Tensor result, Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_out_dense_sparse_cpu + CUDA: add_out_dense_sparse_cuda + +- func: s_native_add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_sparse_cpu + SparseCUDA: s_add_sparse_cuda + +- func: native_add(Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_dense_sparse_cpu + CUDA: add_dense_sparse_cuda + +- func: s_native_add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_sparse_cpu_ + SparseCUDA: s_add_sparse_cuda_ + +- func: native_add_(Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_dense_sparse_cpu_ + CUDA: add_dense_sparse_cuda_ + +- func: add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + +- func: add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method, function + +- func: add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method + + + +- func: s_native_sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_out_sparse_cpu + SparseCUDA: s_sub_out_sparse_cuda + +- func: s_native_sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_sparse_cpu + SparseCUDA: s_sub_sparse_cuda + +- func: s_native_sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_sparse_cpu_ + SparseCUDA: s_sub_sparse_cuda_ + +- func: sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + +- func: sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method, function + +- func: sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method + + + +- func: s_native_mul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_out_sparse_cpu + SparseCUDA: s_mul_out_sparse_cuda + +- func: s_native_mul(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_sparse_cpu + 
SparseCUDA: s_mul_sparse_cuda + +- func: s_native_mul_(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_sparse_cpu_ + SparseCUDA: s_mul_sparse_cuda_ + +- func: native_mul_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_out_sparse_scalar + SparseCUDA: mul_out_sparse_scalar + +- func: native_mul(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_sparse_scalar + SparseCUDA: mul_sparse_scalar + +- func: native_mul_(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_sparse_scalar_ + SparseCUDA: mul_sparse_scalar_ + +- func: mul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + +- func: mul_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + +- func: mul(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: mul(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: mul_(Tensor self, Tensor other) -> Tensor + variants: method + +- func: mul_(Tensor self, Scalar other) -> Tensor + variants: method + + + +- func: native_div_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_out_sparse_scalar + SparseCUDA: div_out_sparse_scalar + +- func: native_div(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_sparse_scalar + SparseCUDA: div_sparse_scalar + +- func: native_div_(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_sparse_scalar_ + SparseCUDA: div_sparse_scalar_ + +- func: div_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + +- func: div(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: div_(Tensor self, Scalar other) -> Tensor + variants: method + + +- func: s_native_addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_out_sparse_dense_cpu + CUDA: s_addmm_out_sparse_dense_cuda + +- func: s_native_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_sparse_dense_cpu + CUDA: s_addmm_sparse_dense_cuda + +- func: s_native_addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_sparse_dense_cpu_ + CUDA: s_addmm_sparse_dense_cuda_ + +- func: addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method, function + +- func: addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + + +- func: native_tensor(Type self_ty) -> Tensor + variants: function + dispatch: + SparseCPU: new_sparse + SparseCUDA: new_sparse + +- func: native_tensor(Type self_ty, IntList size) -> Tensor + variants: function + dispatch: + SparseCPU: new_with_size_sparse + SparseCUDA: new_with_size_sparse + +- func: tensor(Type dtype) -> Tensor + variants: [] + +- func: tensor(Type dtype, IntList size) -> Tensor + variants: [] + + +# NB: I have to decompose sparse_coo_tensor into two functions, because +# it has custom dispatch logic for which Type to dispatch on (we must +# use the sparse 
equivalent of the type of the SECOND argument). +# +# The actual dispatcher, native_sparse_coo_tensor, has all of its overloads +# removed so you don't accidentally trigger the default behavior, which +# is to infer Type based on the first argument (indices), which is ~never +# what you want. (I guess hypothetically it would work; you'd +# just only ever dispatch to CPULongTensor or CUDALongTensor, but that +# seems a bit too finely balanced.) + +- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_sparse + SparseCUDA: new_with_tensor_sparse + +- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_and_size_sparse + SparseCUDA: new_with_tensor_and_size_sparse + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor + variants: [] + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + + +- func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_and_size_unsafe_sparse + SparseCUDA: new_with_tensor_and_size_unsafe_sparse + +- func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: function + + +- func: sparse_raw_resize_(Tensor self, IntList size, int64_t sparseDims, int64_t denseDims) -> Tensor + variants: method + dispatch: + SparseCPU: raw_resize_sparse_ + SparseCUDA: raw_resize_sparse_ + + +- func: _sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor + variants: method + dispatch: + CPU: sparse_mask_cpu + CUDA: sparse_mask_cuda + + +- func: to_dense(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: sparse_to_dense + SparseCUDA: sparse_to_dense + + +- func: _sparseDims(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _sparseDims_sparse + SparseCUDA: _sparseDims_sparse + device_guard: False + +# legacy method +- func: _dimI(Tensor self) -> int64_t + variants: method + dispatch: _sparseDims_sparse + device_guard: False + + +- func: _denseDims(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _denseDims_sparse + SparseCUDA: _denseDims_sparse + device_guard: False + +# legacy method +- func: _dimV(Tensor self) -> int64_t + variants: method + dispatch: _denseDims_sparse + device_guard: False + + +- func: _nnz(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _nnz_sparse + SparseCUDA: _nnz_sparse + device_guard: False + + +- func: coalesce(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: coalesce_sparse_cpu + SparseCUDA: coalesce_sparse_cuda + + +- func: is_coalesced(Tensor self) -> bool + variants: method + dispatch: + SparseCPU: is_coalesced_sparse + SparseCUDA: is_coalesced_sparse + device_guard: False + + +- func: _indices(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: _indices_sparse + SparseCUDA: _indices_sparse + device_guard: False + + +- func: _values(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: _values_sparse + SparseCUDA: _values_sparse + device_guard: False + + +- func: hspmm_out(Tensor result, Tensor mat1, Tensor mat2) -> Tensor + variants: function + dispatch: + SparseCPU: hspmm_out_sparse_cpu + SparseCUDA: hspmm_out_sparse_cuda + +- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor + variants: function + dispatch: + SparseCPU: hspmm_sparse_cpu + SparseCUDA: 
hspmm_sparse_cuda + +# This "raw copy" doesn't handle conversions NOR does it handle non-blocking. +- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor + variants: function + dispatch: + SparseCPU: copy_sparse_ + SparseCUDA: copy_sparse_ + +- func: numel(Tensor self) -> int64_t + variants: + - method + - function + device_guard: False + +- func: unbind(Tensor self, int64_t dim=0) -> TensorList + variants: + - method + - function + +- func: native_get_device(Tensor self) -> int64_t + variants: function + dispatch: + SparseCUDA: get_device_sparse_cuda + device_guard: False + +- func: get_device(Tensor self) -> int64_t + device_guard: False + +- func: meshgrid(TensorList tensors) -> TensorList + variants: function diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp new file mode 100644 index 0000000..0cac9bc --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -0,0 +1,390 @@ +// Basic functions on sparse tensors + +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +/****************************************************************************** + * access methods + ******************************************************************************/ + +int64_t _sparseDims_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->sparseDims(); +} + +int64_t _denseDims_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->denseDims(); +} + +bool is_coalesced_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->coalesced(); +} + +int64_t _nnz_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->nnz(); +} + +// TODO: This is wrong: if nnz == 0 but indices/values is not +// empty then we'll return all the values, even the ones that +// are "masked out" by nnz + +Tensor _indices_sparse(const SparseTensor& self) { + auto nnz = self._nnz(); + if (nnz == 0) { + // Narrows don't work on 0-length tensors + // TODO: When we handle zero-size dims correctly, this will work and + // we can remove the special case. + return _get_sparse_impl(self)->indices(); + } + return _get_sparse_impl(self)->indices().narrow(1, 0, nnz); +} + +Tensor _values_sparse(const SparseTensor& self) { + // See indices for some relevant notes + auto nnz = self._nnz(); + if (nnz == 0) { + return _get_sparse_impl(self)->values(); + } + return _get_sparse_impl(self)->values().narrow(0, 0, nnz); +} + +/****************************************************************************** + * creation methods + ******************************************************************************/ + +/* Empty init */ +SparseTensor new_sparse(const SparseType& dtype) { + AT_ASSERT(!dtype.is_undefined()); + AT_ASSERT(!dtype.is_variable()); + AT_ASSERT(dtype.is_sparse()); + // TODO: Hmm... this const_cast business seems a bit dodgy + return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); +} + +/*** Helper methods ***/ + +/* Pointer-copy init */ +SparseTensor new_with_tensor_sparse(const LongTensor& indices, const Tensor& values_) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + // TODO: This is a temporary test until we support zero-size dims. 
+ // I'm NOT adding the "obvious" bypass code, because it wasn't supported + // previously + AT_CHECK(indices.numel() != 0, "cannot construct sparse tensor with empty indices; use the nullary constructor instead"); + + const SparseType& dtype = values.type().toSparse(); + + // If sizes are not given, it is inferred as max index of each dim. + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + + std::vector computed_sizes(sparseDims + denseDims); + // NB: It used to keepdim. I think that was wrong. + LongTensor computed_indices_sizes = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); + computed_indices_sizes.add_(1); // len = max_index + 1 + LongTensor cpu_computed_indices_sizes; + if (computed_indices_sizes.is_cuda()) { + cpu_computed_indices_sizes = at::CPU(kLong).tensor(computed_indices_sizes.sizes()); + cpu_computed_indices_sizes.copy_(computed_indices_sizes); + } else { + cpu_computed_indices_sizes = computed_indices_sizes; + } + auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); + for (int64_t d = 0; d < sparseDims; d++) { + computed_sizes[static_cast(d)] = cpu_computed_indices_sizes_accessor[d]; + } + for (int64_t d = 0; d < denseDims; d++) { + computed_sizes[static_cast(sparseDims + d)] = values.size(d+1); + } + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, computed_sizes, indices, values); +} + +SparseTensor new_with_size_sparse(const SparseType& dtype, ArrayRef size) { + SparseTensor self = new_sparse(dtype); + _raw_resize_sparse(self, size.size(), 0, size); + return self; +} + +// NB: Got rid of the sizes == NULL case +SparseTensor new_with_tensor_and_size_unsafe_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + const SparseType& dtype = values.type().toSparse(); + // NB: used to be a dim() == 0 test, but that's legacy TH semantics + if (indices.numel() == 0 && values.numel() == 0) { + return new_with_size_sparse(dtype, sizes); + } + + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); +} + +// NB: Got rid of the sizes == NULL case +SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + const SparseType& dtype = values.type().toSparse(); + // NB: This used to be dims, but mumble TH handling zero-sized tensors + // incorrectly + if (indices.numel() == 0 && values.numel() == 0) { + return new_with_size_sparse(dtype, sizes); + } + + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + AT_CHECK(sizes.size() == sparseDims + denseDims, "number of dimensions must be sparseDims (", sparseDims, ") + denseDims (", denseDims, "), but got ", sizes); + + LongTensor max_indices = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); + LongTensor cpu_max_indices; + if (max_indices.is_cuda()) { + cpu_max_indices = at::CPU(kLong).copy(max_indices); + } else { + cpu_max_indices = max_indices; + } + auto cpu_max_indices_accessor = cpu_max_indices.accessor(); + for (int64_t d = 0; d < sparseDims; d++) { + // NB: This used to sync 
ndim times to access each entry; now we copy + // everything to CPU first and then access it. + int64_t max_index_in_dim = cpu_max_indices_accessor[d]; + int64_t dim_size = sizes[static_cast(d)]; + AT_CHECK(max_index_in_dim < dim_size, + "sizes is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); + } + for (int64_t d = 0; d < denseDims; d++) { + int64_t values_size = values.size(d+1); + int64_t specified_size = sizes[static_cast(sparseDims + d)]; + AT_CHECK(values_size <= specified_size, + "values and sizes are inconsistent: sizes[", d + sparseDims, "] is ", specified_size, + " but values.size(", d + 1, ") is ", values_size); + } + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); +} + +// NB: Deleted newWithSizeNd variants + +SparseTensor clone_sparse(const SparseTensor& self) { + SparseTensor other = new_sparse(self.type()); + _raw_resize_sparse(other, self._sparseDims(), self._denseDims(), self.sizes()); + // NB: This seems to preserve the size of the UN-narrowed indices and + // values. Veeery interesting. + _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values()); + _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); + _get_sparse_impl(other)->set_nnz(self._nnz()); + return other; +} + +/****************************************************************************** + * reshaping methods + ******************************************************************************/ + +/* +// We should implement a utility function which: (1) sets nnz and (2) resizes +// indices/values to hold enough space to fit nnz, if nnz is larger than +// the previous amount. This ensures that we maintain the nnz invariant. +void _resize_nnz_(const SparseTensor& self, int64_t nnz) { +} +*/ + +void resize_sparse(const SparseTensor& self, ArrayRef size) { + _raw_resize_sparse(self, size.size(), 0, size); +} + +SparseTensor& raw_resize_sparse_(SparseTensor& self, ArrayRef size, int64_t sparseDims, int64_t denseDims) { + if (sparseDims == -1) { + sparseDims = self._indices().size(0); + } + if (denseDims == -1) { + denseDims = self._values().dim() - 1; + } + _raw_resize_sparse(self, sparseDims, denseDims, size); + return self; +} + +namespace { + bool _is_same_size_as_sparse(const SparseTensor& self, const SparseTensor& src) { + return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims() && self.sizes().equals(src.sizes()); + } +} + +SparseTensor& resize_as_sparse_(SparseTensor& self, const SparseTensor& src) { + if (!_is_same_size_as_sparse(self, src)) { + _raw_resize_sparse(self, src._sparseDims(), src._denseDims(), src.sizes()); + } + return self; +} + +// NB: Dropped the resizeNd variants + +Tensor sparse_to_dense(const SparseTensor& self) { + Tensor dst = at::zeros(self.sizes(), self.type().toDense()); + return dst.add_(self); +} + +SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) { + if (isSameTensor(self, src)) return self; + _raw_resize_sparse(self, src._sparseDims(), src._denseDims(), src.sizes()); + // NB: This seems to copy the underlying full indices/values buffer + _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values()); + _get_sparse_impl(self)->set_coalesced(src.is_coalesced()); + _get_sparse_impl(self)->set_nnz(src._nnz()); + return self; +} + +SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { + AT_ASSERT(self.defined()); + AT_ASSERT(!self.is_variable()); + 
AT_ASSERT(self.is_sparse()); + + if (self._nnz() < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } + if (self.is_coalesced()) { + return self; + } + + LongTensor indices = self._indices(); + Tensor values = self._values().contiguous(); + int64_t sparseDims = self._sparseDims(); + int64_t denseDims = self._denseDims(); + int64_t nnz = self._nnz(); + + LongTensor indices_scalar = at::zeros({nnz}, kLong); + + int64_t factor = 1; + for (int64_t d = sparseDims - 1; d >= 0; d--) { + LongTensor indices_slice = indices.select(0, d); + indices_scalar.add_(indices_slice, factor); // cadd is swapped args + factor *= self.size(d); + } + + SparseTensor dst = new_sparse(self.type()); + _raw_resize_sparse(dst, sparseDims, denseDims, self.sizes()); + // TODO: is there a more idiomatic way to do this? + LongTensor newIndices = indices.type().tensor(indices.sizes()); + Tensor newValues = values.type().tensor(values.sizes()); + _alias_into_sparse(dst, newIndices, newValues); + + LongTensor indicesBuffer; + LongTensor indicesPermutation; + std::tie(indicesBuffer, indicesPermutation) = indices_scalar.sort(0); + // NB: The accessor accesses here rely on self._nnz() > 0 (tested earlier in this function) + auto newIndicesAccessor = newIndices.accessor(); + auto indicesAccessor = indices.accessor(); + auto indicesPermutationAccessor = indicesPermutation.accessor(); + auto indicesBufferAccessor = indicesBuffer.accessor(); + + int64_t i = -1; + AT_DISPATCH_ALL_TYPES( + values.type(), "coalesce", [&] { + int64_t prev = -1; + int64_t blockSize = values.stride(0); + scalar_t* values_ptr = values.data(); + scalar_t* newValues_ptr = newValues.data(); + for (int64_t j = 0; j < nnz; j++) { + int64_t pos = indicesPermutationAccessor[j]; + int64_t curr = indicesBufferAccessor[j]; + if (curr == prev) { + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } else { + ++i; + for (int64_t d = 0; d < sparseDims; d++) { + newIndicesAccessor[d][i] = indicesAccessor[d][pos]; + } + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } + prev = curr; + } + }); + + _get_sparse_impl(dst)->set_coalesced(true); + _get_sparse_impl(dst)->set_nnz(i + 1); + + return dst; +} + +SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { + AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + t.sizes(), " but mask has size ", mask.sizes()); + AT_ASSERT(!t.is_cuda()); // we were supposed to have dispatched on this + AT_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); + AT_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); + resize_as_sparse_(r, mask); + if (mask._nnz() == 0) { + r.zero_(); + return r; + } + int64_t dim = t.dim(); + int64_t sparseDims = mask._sparseDims(); + LongTensor mask_indices = mask._indices(); + Tensor mask_values = mask._values(); + Tensor r_values = r._values().type().tensor(mask_values.sizes()); + _alias_into_sparse(r, mask_indices.clone(), r_values); + _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); + int64_t r_nnz = mask._nnz(); + _get_sparse_impl(r)->set_nnz(r_nnz); + // NB: Relies on mask._nnz() == 0 test above + auto mask_indices_accessor = mask_indices.accessor(); + + if (dim > sparseDims) { + // NB: This used to reuse buffers, but I deoptimized it + for (int64_t i = 0; i < r_nnz; i++) { + Tensor 
srcBuffer = t; + for (int64_t d = 0; d < sparseDims; d++) { + srcBuffer = srcBuffer.select(0, mask_indices_accessor[d][i]); + } + Tensor dstBuffer = r_values.select(0, i); + dstBuffer.copy_(srcBuffer); + } + } else { + AT_DISPATCH_ALL_TYPES( + r_values.type(), "sparse_mask", [&] { + auto r_values_accessor = r_values.accessor(); + // NB: The old code did this pointer access in a weird way (going straight + // to storage + storageOffset.) Was there perhaps a method to the + // madness? + scalar_t* t_ptr = t.data(); + for (int64_t i = 0; i < r_nnz; i++) { + int64_t idx = 0; + for (int64_t d = 0; d < sparseDims; d++) { + idx += mask_indices_accessor[d][i] * t.stride(d); + } + scalar_t val = t_ptr[idx]; + r_values_accessor[i] = val; + } + }); + } + return r; +} + +SparseTensor sparse_mask_cpu(const Tensor& t, SparseTensorRef mask) { + SparseTensor r = t.type().toSparse().tensor(); + sparse_mask_out_cpu(r, t, mask.tref); + return r; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp new file mode 100644 index 0000000..4a25665 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -0,0 +1,870 @@ +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +// -------------------------------------------------------------------- +// Utility functions +// -------------------------------------------------------------------- + +namespace { + LongTensor _to_csr(const int64_t* indices, int64_t dim, int64_t nnz) { + int64_t h, i, hp0, hp1; + LongTensor csr = native::zeros({dim + 1}, kLong); + + // TODO: eliminate this conditional when zero-size dims supported correctly + if (nnz > 0) { + auto csr_accessor = csr.accessor(); + // Convert the sparse matrix to CSR format +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; iindices() + // and not self._indices(), because the latter will possibly + // return a view (which means that the in-place operation will + // not work). + if (_get_sparse_impl(self)->indices().numel()) { + // TODO: To be fixed when we support zero-size dims + _get_sparse_impl(self)->indices().resize_({0}); + } + + if (_get_sparse_impl(self)->values().numel()) { + _get_sparse_impl(self)->values().resize_({0}); + } + _get_sparse_impl(self)->set_nnz(0); + _get_sparse_impl(self)->set_coalesced(true); // NB: This is new + return self; +} + +// NB: Don't need zeros, zeros_like, already implemented in TensorFactories + +// -------------------------------------------------------------------- +// mul(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + r._values().mul_(value); + } else { + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... 
needed because mul_out takes Tensor& + at::mul_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + } + return r; +} + +SparseTensor mul_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + mul_out_sparse_scalar(r, t, value); + return r; +} + +SparseTensor& mul_sparse_scalar_(SparseTensor& t, Scalar v) { + return mul_out_sparse_scalar(t, t, v); +} + +// -------------------------------------------------------------------- +// log1p(SparseTensor) +// -------------------------------------------------------------------- + +// TODO: add in-place variant + +SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + // don't have in-place log1p for uncoalesced input because coalesce() is not in-place + AT_CHECK( + r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + } + else { + r = raw_copy_sparse_(r, t.coalesce()); + } + r._values().log1p_(); + return r; +} + +SparseTensor& log1p_sparse_(SparseTensor& t) { + AT_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + return log1p_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// pow(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +// TODO: add in-place variant + +SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t_.is_sparse()); + AT_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); + + // This coalesce is why we can't easily provide an inplace variant + SparseTensor t = t_.coalesce(); + + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... needed because pow_out takes Tensor& + at::pow_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + + return r; +} + +SparseTensor pow_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + pow_out_sparse_scalar(r, t, value); + return r; +} + +// -------------------------------------------------------------------- +// div(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +SparseTensor& div_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + r._values().div_(value); + } else { + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... 
needed because div_out takes Tensor& + at::div_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + } + return r; +} + +SparseTensor div_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + div_out_sparse_scalar(r, t, value); + return r; +} + +SparseTensor& div_sparse_scalar_(SparseTensor& t, Scalar value) { + return div_out_sparse_scalar(t, t, value); +} + +// -------------------------------------------------------------------- +// norm(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +// Only supports floating point, FYI +Tensor norm_sparse(const SparseTensor& self, Scalar value) { + AT_ASSERT(self.is_sparse()); + + return self.coalesce()._values().norm(value); +} + +// -------------------------------------------------------------------- +// add(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + AT_ASSERT(!t.is_cuda()); // the dispatch argument + AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); + + if (src._nnz() == 0) { + return raw_copy_sparse_(r, t); + } + if (t._nnz() == 0) { + return mul_out_sparse_scalar(r, src, value); + } + + AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(), max_nnz = t_nnz + s_nnz; + bool t_coalesced = t.is_coalesced(), s_coalesced = src.is_coalesced(); + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices = t._indices(); + Tensor t_values = t._values(); + LongTensor src_indices = src._indices(); + Tensor s_values = src._values(); + LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + Tensor r_values = _new_values_with_size_of(s_values, max_nnz).zero_(); + r.resize_as_(src); + _get_sparse_impl(r)->set_indices_and_values(r_indices, r_values); // TODO: sigh + + int64_t blockSize = r_values.stride(0); + int64_t cmp, d; + int64_t r_i = 0, t_i = 0, s_i = 0; + + // NB: relies on nnz tests above + auto t_indices_accessor = t_indices.accessor(); + auto r_indices_accessor = r_indices.accessor(); + auto src_indices_accessor = src_indices.accessor(); + + AT_DISPATCH_ALL_TYPES( + t_values.type(), "cadd_sparse", [&] { + scalar_t* t_values_ptr = t_values.data(); + scalar_t* s_values_ptr = s_values.data(); + scalar_t* r_values_ptr = r_values.data(); + scalar_t cast_value = value.to(); + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < sparseDims; d++) { + if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { + cmp = 1; + break; + } + if (t_indices_accessor[d][t_i] > src_indices_accessor[d][s_i]) { + cmp = -1; + break; + } + } + } + if (cmp >= 0) { + for (d = 0; d < sparseDims; 
d++) { + r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; + } + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + t_i++; + } + if (cmp <= 0) { + for (d = 0; d < sparseDims; d++) { + r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; + } + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + s_i++; + } + r_i++; + } + } + ); + + _get_sparse_impl(r)->set_nnz(r_i); + // TODO: I think it may be possible to track inside the loop and + // detect when we are uncoalesced (e.g., by observing that an + // index goes backwards) which may be more precise than using the + // coalesced flag here. But this is easy. + _get_sparse_impl(r)->set_coalesced(t_coalesced && s_coalesced); + + return r; +} + +SparseTensor s_add_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_add_out_sparse_cpu(r, t, src, alpha); + return r; +} + +SparseTensor& s_add_sparse_cpu_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_add_out_sparse_cpu(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// add(Tensor, SparseTensor, Scalar) +// formerly known as spcadd +// -------------------------------------------------------------------- + +template +void add_dense_sparse_worker_cpu(Tensor& r, Scalar value, const SparseTensor& sparse, const Tensor& indices, const Tensor& values) { + int64_t k; + + auto indices_accessor = indices.accessor(); + auto values_accessor = values.accessor(); + + scalar_t* r_ptr = r.data(); + scalar_t cast_value = value.to(); + + #pragma omp parallel for private(k) + for (k = 0; k < sparse._nnz(); k++) { + int64_t index = r.storage_offset(); + for (int64_t d = 0; d < sparse._sparseDims(); d++) { + index += r.stride(d) * indices_accessor[d][k]; + } + r_ptr[index] += cast_value * values_accessor[k]; + } +} + +Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef sparse__, Scalar value) { + const SparseTensor& sparse_ = sparse__.tref; + + AT_ASSERT(!r.is_sparse()); + AT_ASSERT(!dense.is_sparse()); + AT_ASSERT(sparse_.is_sparse()); + + AT_ASSERT(!dense.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse_.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); + + r.resize_as_(dense); + SparseTensor sparse = sparse_.coalesce(); + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + int64_t nDim = dense.dim(); + int64_t nDimI = sparse._sparseDims(); + + if (!isSameTensor(r, dense)) r.copy_(dense); + if (sparse._nnz() == 0) return r; + + // accessors rely on nnz test + if (nDim > nDimI) { + auto indices_accessor = indices.accessor(); + for (int64_t k = 0; k < sparse._nnz(); k++) { + Tensor dstBuffer = r; + for (int64_t d = 0; d < sparse._sparseDims(); d++) { + dstBuffer = dstBuffer.select(0, indices_accessor[d][k]); + } + Tensor srcBuffer = values.select(0, k); + dstBuffer.add_(srcBuffer, value); + } + } else { + AT_DISPATCH_ALL_TYPES( + values.type(), "add_dense_sparse", [&] { + add_dense_sparse_worker_cpu(r, value, sparse, indices, values); + 
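The worker above reduces dense-plus-sparse addition to a strided scatter-add: each non-zero's coordinates are folded into a flat offset using the dense tensor's strides, and `alpha * value` is accumulated at that slot. Below is a plain-C++ sketch of that arithmetic on `std::vector` buffers; the function name is made up and this is illustrative only, not the ATen code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// For each non-zero k, turn its sparse coordinates into a flat offset via the
// dense tensor's strides, then accumulate alpha * value into that slot.
void scatter_add_coo_into_dense(
    std::vector<double>& dense,                         // flattened dense buffer
    const std::vector<int64_t>& strides,                // one stride per sparse dim
    const std::vector<std::vector<int64_t>>& indices,   // [sparseDims][nnz]
    const std::vector<double>& values,                  // [nnz]
    double alpha) {
  const int64_t sparseDims = static_cast<int64_t>(indices.size());
  const int64_t nnz = static_cast<int64_t>(values.size());
  for (int64_t k = 0; k < nnz; k++) {
    int64_t offset = 0;
    for (int64_t d = 0; d < sparseDims; d++) {
      offset += strides[d] * indices[d][k];
    }
    dense[offset] += alpha * values[k];
  }
}

int main() {
  // 2x3 dense matrix, row-major strides {3, 1}; add non-zeros at (0,2) and (1,0).
  std::vector<double> dense(6, 0.0);
  scatter_add_coo_into_dense(dense, {3, 1}, {{0, 1}, {2, 0}}, {5.0, -1.0}, /*alpha=*/2.0);
  assert(dense[2] == 10.0);
  assert(dense[3] == -2.0);
  return 0;
}
```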
}); + } + return r; +} + +Tensor add_dense_sparse_cpu(const Tensor& t, SparseTensorRef src, Scalar alpha) { + Tensor r = t.type().tensor(); + add_out_dense_sparse_cpu(r, t, src, alpha); + return r; +} + +Tensor& add_dense_sparse_cpu_(Tensor& t, SparseTensorRef src, Scalar alpha) { + return add_out_dense_sparse_cpu(t, t, src, alpha); +} + + +// -------------------------------------------------------------------- +// sub(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_sub_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(!t.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "sub: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src.is_cuda(), "sub: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + // UGH... We're doing two dispatches on scalar type here for no good reason. + // NB: I tried adding an operator- to Scalar, but there isn't any good way + // to negate the tensor, because I have a TensorBase... + AT_DISPATCH_ALL_TYPES( + t.type(), "sub_sparse", [&] { + scalar_t cast_value = value.to(); + s_add_out_sparse_cpu(r, t, src, -cast_value); + } + ); + return r; +} + +SparseTensor s_sub_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_sub_out_sparse_cpu(r, t, src, alpha); + return r; +} + +SparseTensor& s_sub_sparse_cpu_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_sub_out_sparse_cpu(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// mul(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_mul_out_sparse_cpu(SparseTensor& r, const SparseTensor& t_, const SparseTensor& src_) { + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); + AT_ASSERT(!t_.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); + + if (src_._nnz() == 0 || t_._nnz() == 0) { + return r.zero_(); + } + + SparseTensor t = t_.coalesce(); + SparseTensor src = src_.coalesce(); + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); + int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices = t._indices(); + Tensor t_values = t._values(); + LongTensor src_indices = src._indices(); + Tensor s_values = src._values(); + LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + Tensor r_values = _new_values_with_size_of(t_values, max_nnz).zero_(); + r.resize_as_(src); + _get_sparse_impl(r)->set_indices_and_values(r_indices, r_values); // TODO: sigh + + int64_t match, d; + int64_t r_i = 0, t_i = 0, s_i = 0; + + // NB: relies on nnz test above + auto t_indices_accessor = t_indices.accessor(); + auto r_indices_accessor = r_indices.accessor(); + auto src_indices_accessor = src_indices.accessor(); + + // Check if we can find matching indices, and if so, write an + // entry to the result indices vector. 
Returns true if matching + // indices were found. + auto index_preamble = [&]() { + match = 1; + for (d = 0; d < sparseDims; d++) { + if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices_accessor[d][t_i] > src_indices_accessor[d][s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) return false; + for (d = 0; d < sparseDims; d++) { + r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; + } + return true; + }; + + if (t_values.dim() > 1) { + while (t_i < t_nnz && s_i < s_nnz) { + if (!index_preamble()) continue; + r_values.select(0, r_i).addcmul_(t_values.select(0, t_i), s_values.select(0, s_i)); + r_i++; + t_i++; + s_i++; + } + } else { + AT_DISPATCH_ALL_TYPES( + r_values.type(), "mul_out_sparse", [&] { + auto r_accessor = r_values.accessor(); + auto t_accessor = t_values.accessor(); + auto s_accessor = s_values.accessor(); + + while (t_i < t_nnz && s_i < s_nnz) { + if (!index_preamble()) continue; + r_accessor[r_i] = t_accessor[t_i] * s_accessor[s_i]; + r_i++; + t_i++; + s_i++; + } + } + ); + } + + _get_sparse_impl(r)->set_nnz(r_i); + _get_sparse_impl(r)->set_coalesced(true); + + return r; +} + +SparseTensor s_mul_sparse_cpu(const SparseTensor& t, const SparseTensor& src) { + SparseTensor r = t.type().tensor(); + s_mul_out_sparse_cpu(r, t, src); + return r; +} + +SparseTensor& s_mul_sparse_cpu_(SparseTensor& t, const SparseTensor& src) { + return s_mul_out_sparse_cpu(t, t, src); +} + +// -------------------------------------------------------------------- +// addmm(Tensor, SparseTensorRef, Tensor, Scalar, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +// NB: OMP pragmas have to get their own functions; can't put them in lambdas +template +void s_addmm_out_sparse_dense_worker(int64_t nnz, int64_t dim_i, int64_t dim_j, int64_t dim_k, Tensor& r, Scalar beta, const Tensor& t, Scalar alpha, const Tensor& csr, const Tensor& indices, const Tensor& values, const Tensor& dense) { + int64_t h, i; + + // r_ = alpha * sparse * dense + scalar_t cast_alpha = alpha.to(); + scalar_t cast_beta = beta.to(); + if (cast_beta == 0) { + r.zero_(); + } else if (cast_beta == 1) { + if (!isSameTensor(r, t)) { + r.copy_(t); + } + } else { + at::mul_out(r, t, beta); + } + + auto csr_accessor = csr.accessor(); + auto indices_accessor = indices.accessor(); + + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data(); + scalar_t* r_ptr = r.data(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t r_stride0 = r.stride(0); + int64_t r_stride1 = r.stride(1); +#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) + for (h = 0; h < dim_i; h++) { + int64_t i_start = csr_accessor[h]; + int64_t i_end = csr_accessor[h+1]; + for (i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + int64_t col = indices_accessor[1][i]; + if (col >= 0 && col < dim_j) { + THBlas_axpy(dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, dense_stride1, + r_ptr + h * r_stride0, r_stride1); + } else { + AT_ERROR("addmm: index out of bound: ", col, " not between 1 and ", dim_j); + } + } + } +}; + +Tensor& s_addmm_out_sparse_dense_cpu( + Tensor& r, + const Tensor& t, + const SparseTensor& sparse_, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + // TODO: This error message seems awfully opaque + AT_ASSERT(!t.is_cuda()); + AT_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA 
tensor"); + AT_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); + AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + + SparseTensor sparse = sparse_.coalesce(); + + // ixj * jxk = ixk + int64_t dim_i = sparse.size(0); + int64_t dim_j = sparse.size(1); + int64_t dim_k = dense.size(1); + + AT_CHECK(dense.size(0) == dim_j, + "addmm: Argument #3 (dense): Expected dim 0 size ", dim_j, ", got ", dense.size(0)); + AT_CHECK(t.size(0) == dim_i, + "addmm: Argument #1 (t): Expected dim 0 size ", dim_i, ", got ", t.size(0)); + AT_CHECK(t.size(1) == dim_k, + "addmm: Argument #1 (t): Expected dim 1 size ", dim_k, ", got ", t.size(1)); + + r.resize_({dim_i, dim_k}); + + int64_t nnz = sparse._nnz(); + + if (nnz == 0) { + at::mul_out(r, t, beta); + return r; + } + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + LongTensor csr = _to_csr(indices.data(), dim_i, nnz); + + AT_DISPATCH_ALL_TYPES( + values.type(), "addmm_sparse_dense", [&] { + s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, csr, indices, values, dense); + } + ); + + return r; + +} + +Tensor s_addmm_sparse_dense_cpu( + const Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + Tensor r = t.type().tensor(); + s_addmm_out_sparse_dense_cpu(r, t, sparse, dense, beta, alpha); + return r; +} + +Tensor& s_addmm_sparse_dense_cpu_( + Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + return s_addmm_out_sparse_dense_cpu(t, t, sparse, dense, beta, alpha); +} + + +// -------------------------------------------------------------------- +// hspmm(SparseTensor mat1, Tensor mat2) +// -------------------------------------------------------------------- + +SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, const Tensor& dense) { + // TODO: Make this a real argument + Scalar alpha = 1; + + AT_ASSERT(!sparse_.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, + "hspmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); + + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(dense.size(0) == k, + "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); + + _get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + + if (nnz == 0) { + r.zero_(); + return r; + } + + LongTensor indices = at::CPU(kLong).tensor({1, nnz}); + + // Initialize the sparse matrix that will be used with spaddmm to send rows + // from the dense 
matrix to rows of the output's value tensor + SparseTensor newSparse = sparse.clone(); + LongTensor spIndices = newSparse._indices(); + LongTensor valueIndices = spIndices.select(0, 0); + + // Compute output indices + auto valueIndices_accessor = valueIndices.accessor(); + auto indices_accessor = indices.accessor(); + + int64_t i = -1, prevIdx = -1; + for (int64_t j = 0; j < nnz; j++) { + int64_t currIdx = valueIndices_accessor[j]; + if (currIdx != prevIdx) { + indices_accessor[0][++i] = currIdx; + prevIdx = currIdx; + } + valueIndices_accessor[j] = i; + } + int64_t outNnz = i + 1; + indices.resize_({1, outNnz}); + Tensor values = dense.type().tensor({outNnz, n}); + _get_sparse_impl(newSparse)->_sizes_mut()[0] = outNnz; // TODO: use something safer + + // Compute output values tensor with sparse * dense multiplication + s_addmm_out_sparse_dense_cpu(values, values, newSparse, dense, 0, alpha); + _get_sparse_impl(r)->set_indices_and_values(indices, values); // TODO: sigh + + return r; +} + +SparseTensor hspmm_sparse_cpu(const SparseTensor& sparse, const Tensor& dense) { + SparseTensor r = sparse.type().tensor(); + hspmm_out_sparse_cpu(r, sparse, dense); + return r; +} + +// -------------------------------------------------------------------- +// sspaddmm +// -------------------------------------------------------------------- + +SparseTensor& _sspaddmm_out_cpu( + SparseTensor& r, + const SparseTensor& t, + const SparseTensor& sparse_, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + AT_ASSERT(!t.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, + "sspaddmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "sspaddmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "sspaddmm: Argument #2: matrices expected, got ", dense.dim(), "D tensor"); + + SparseTensor sparse = sparse_.coalesce(); + + // ixj * jxk = ixk + int64_t dim_i = sparse.size(0); + int64_t dim_j = sparse.size(1); + int64_t dim_k = dense.size(1); + + // NB: This has to occur before the checks, because r may alias t. 
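Both the `addmm` path above and the `sspaddmm` path below first convert the coalesced COO row coordinates into CSR row pointers with `_to_csr`, so that row `h` of the sparse operand is the contiguous slice `[csr[h], csr[h+1])`. The stand-alone sketch below shows that conversion as a count-then-prefix-sum, which is a simpler formulation than the exact loop in `_to_csr`; plain C++, illustrative only.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Build CSR row pointers from the row coordinates of a coalesced (row-sorted)
// COO matrix with `rows` rows. csr[h+1] - csr[h] is the non-zero count of row h.
std::vector<int64_t> coo_rows_to_csr(const std::vector<int64_t>& row_indices, int64_t rows) {
  std::vector<int64_t> csr(rows + 1, 0);
  for (int64_t r : row_indices) {
    csr[r + 1]++;              // count entries per row
  }
  for (int64_t h = 0; h < rows; h++) {
    csr[h + 1] += csr[h];      // prefix sum -> row pointers
  }
  return csr;
}

int main() {
  // Non-zeros in rows 0, 0, 2 of a 4-row matrix.
  auto csr = coo_rows_to_csr({0, 0, 2}, 4);
  assert((csr == std::vector<int64_t>{0, 2, 2, 3, 3}));
  // Row h spans values[csr[h] .. csr[h+1]) -- exactly how the addmm/sspaddmm loops walk it.
  return 0;
}
```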
+ // See test_saddmm + r.sparse_raw_resize_({dim_i, dim_k}, 2, 0); + + AT_CHECK(dense.size(0) == dim_j, + "sspaddmm: Argument #3: Expected dim 0 size ", dim_j, ", got ", dense.size(0)); + AT_CHECK(t.size(0) == dim_i, + "sspaddmm: Argument #1: Expected dim 0 size ", dim_i, ", got ", t.size(0)); + AT_CHECK(t.size(1) == dim_k, + "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); + + int64_t nnz = sparse._nnz(); + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + + LongTensor csr = _to_csr(indices.data(), dim_i, nnz); + + int64_t t_nnz = t._nnz(); + int64_t r_nnz = nnz * dim_k + t_nnz; + LongTensor newi = native::empty({2, r_nnz}, kLong); + LongTensor newv = native::zeros({r_nnz}, values.options()); + + if (t_nnz != 0) { + LongTensor narrowi = newi.narrow(1, 0, t_nnz); + Tensor narrowv = newv.narrow(0, 0, t_nnz); + + narrowi.copy_(t._indices()); + narrowv.copy_(t._values()); + newv.mul_(beta); + } + + // sparse = sparse * dense + int64_t p = t_nnz; + + auto csr_accessor = csr.accessor(); + auto indices_accessor = indices.accessor(); + auto newi_accessor = newi.accessor(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t newv_stride0 = newv.stride(0); + + AT_DISPATCH_ALL_TYPES( + values.type(), "sspmm", [&] { + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data(); + scalar_t* newv_ptr = newv.data(); + scalar_t cast_alpha = alpha.to(); + + for (int64_t h = 0; h < dim_i; h++) { + int64_t i_start = csr_accessor[h]; + int64_t i_end = csr_accessor[h+1]; + for (int64_t i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + int64_t col = indices_accessor[1][i]; + if (col >= 0 && col < dim_j) { + THBlas_axpy(dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, dense_stride1, + newv_ptr + p * newv_stride0, 1); + } else { + AT_ERROR("index out of bound. sspmm: ", col, " not between 1 and ", dim_j); + } + } + // Fill up the indices with the right values + if (i_start != i_end) { + for (int64_t i = 0; i < dim_k; i++) { + newi_accessor[0][p+i] = h; + newi_accessor[1][p+i] = i; + } + p += dim_k; + } + } + } + ); + + // to avoid a clone + _get_sparse_impl(r)->set_indices(newi); + _get_sparse_impl(r)->set_values(newv); + _get_sparse_impl(r)->set_nnz(p); + + return r; +} + +// sparse, sparse, sparse, dense, real, real -> sparse +Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("tensor.sspaddmm(...) 
can only be called on sparse tensors"); +} + +// sparse, dense -> sparse +Tensor smm(const Tensor& self, const Tensor& mat2) { + auto result = self.type().tensor(); + self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + return result; +} + +// sparse, sparse, dense, real, real -> sparse +Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, + Scalar beta, Scalar alpha) { + auto result = self.type().tensor(); + self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + return result; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h new file mode 100644 index 0000000..226b908 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -0,0 +1,131 @@ +#include +#include + +#include + +namespace at { namespace native { + +// Just for documentary purposes +using SparseTensor = Tensor; +using LongTensor = Tensor; +using IntTensor = Tensor; +using SparseType = Type; + +namespace { + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +SparseTensorImpl* _get_sparse_impl(const SparseTensor& self) { + if (!self.is_sparse()) AT_ERROR("_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Port of the old THCSTensor_(checkGPU), but it doesn't really belong here +// because it is more general +// NB: I dropped kernelP2PEnabled support +// NB: This only works if the tensors are KNOWN to be CUDA. +// TODO: Generalize it so it works on CPU as well +inline bool _check_device(ArrayRef ts) { + if (ts.empty()) { + return true; + } + const Tensor& ref_t = ts.front(); + int64_t curDevice = current_device(); + for (const Tensor& t : ts) { + if (t.get_device() != curDevice) return false; + } + return true; +} + +inline void _raw_resize_sparse(const SparseTensor& self, int64_t sparseDims, int64_t denseDims, IntList size) { + _get_sparse_impl(self)->raw_resize_(sparseDims, denseDims, size); +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. This used to be called THSTensor_(_move) +inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { + _get_sparse_impl(self)->set_indices_and_values(indices, values); +} + +// Take indices and values and makes a (data) copy of them to put into the sparse +// indices/values. 
This used to be called THSTensor_(_set) +inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { + _alias_into_sparse(self, indices.clone(), values.clone()); +} + +// Does NOT make copies of indices/values +inline SparseTensor _new_with_dims_and_tensor_sparse( + const SparseType& dtype, + int64_t sparseDims, + int64_t denseDims, + ArrayRef sizes, + const LongTensor& indices, + const Tensor& values) { + SparseTensor self = new_sparse(dtype); + _raw_resize_sparse(self, sparseDims, denseDims, sizes); + _alias_into_sparse(self, indices, values); + return self; +} + +// TODO: put this into the public API +inline bool isSameTensor(const Tensor& lhs, const Tensor& rhs) { + return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); +} + +inline bool _is_same_density(const SparseTensor& self, const SparseTensor& src) { + return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims(); +} + +// if forceClone is true, the result will forced to be a clone of self. +inline LongTensor _newFlattenedIndices(const SparseTensor& self, bool forceClone) { + LongTensor indices = self._indices(); + int64_t sparseDims = self._sparseDims(); + if (sparseDims == 1) { + if (forceClone) { + return indices.clone(); + } else { + return indices; + } + } else { + // FIXME TH_INDEX_BASE + int64_t factor = 1; + LongTensor indices1D = at::empty({1, self._nnz()}, indices.options()); + indices1D.fill_(TH_INDEX_BASE); + for (int64_t d = sparseDims - 1; d >= 0; d--) { + indices1D.add_(indices.select(0, d), factor); + if (TH_INDEX_BASE != 0) { + indices1D.add_(-TH_INDEX_BASE); + } + factor *= self.size(d); + } + return indices1D; + } +} + +// Give us a new values tensor, with the same dimensionality +// as 'values' but with a new number of non-zero elements. +// TODO: Expose this for real in ATen, some day? +// NB: Doesn't preserve data. +inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { + if (values.numel() == 0) { // values tensor uninitialized + // TODO: This logic looks bogus; if we have an uninitialized + // values tensor, why should we believe that denseDims == 0? + // That's the assumption this code makes. 
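`_newFlattenedIndices` above, like the `indices_scalar` computation in `coalesce_sparse_cpu`, linearizes multi-dimensional COO coordinates into one sortable key per non-zero using row-major arithmetic. The sketch below shows that linearization in plain C++, ignoring the legacy `TH_INDEX_BASE` offset; it is illustrative only, not the ATen code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Flatten sparseDims-dimensional coordinates into scalar keys:
// key = ((i0 * s1 + i1) * s2 + i2) ..., accumulated from the innermost dim out,
// mirroring the factor loop in _newFlattenedIndices and coalesce_sparse_cpu.
std::vector<int64_t> flatten_indices(
    const std::vector<std::vector<int64_t>>& indices,  // [sparseDims][nnz]
    const std::vector<int64_t>& sizes) {                // [sparseDims]
  const int64_t sparseDims = static_cast<int64_t>(indices.size());
  const int64_t nnz = sparseDims ? static_cast<int64_t>(indices[0].size()) : 0;
  std::vector<int64_t> keys(nnz, 0);
  int64_t factor = 1;
  for (int64_t d = sparseDims - 1; d >= 0; d--) {  // innermost dimension varies fastest
    for (int64_t k = 0; k < nnz; k++) {
      keys[k] += indices[d][k] * factor;
    }
    factor *= sizes[d];
  }
  return keys;
}

int main() {
  // Coordinates (0,2) and (1,0) in a 2x3 tensor flatten to keys 2 and 3.
  auto keys = flatten_indices({{0, 1}, {2, 0}}, {2, 3});
  assert((keys == std::vector<int64_t>{2, 3}));
  return 0;
}
```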
+ return values.type().tensor({nnz}); + } else { + std::vector size = values.sizes(); + size[0] = nnz; + return values.type().tensor(size); + } +} + + + +} // anonymous namespace + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh new file mode 100644 index 0000000..44bd3ab --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -0,0 +1,323 @@ +#pragma once + +#include + +namespace at { namespace native { + +namespace apply { + +using at::cuda::detail::TensorInfo; +using indexT = int64_t; + +const int WARP_SIZE = 32; + +template +__device__ void applyOp2( + Op op, IndexType blockSize, + TensorInfo values1, IndexType idx1, + TensorInfo values2, IndexType idx2) { + for (IndexType k = blockIdx.x * blockDim.x + threadIdx.x; + k < blockSize; + k += gridDim.x * blockDim.x) { + op(values1.data + idx1 * blockSize + k, values2.data + idx2 * blockSize + k); + } +} + +template +__device__ void applyOp3( + Op op, IndexType blockSize, + TensorInfo values1, IndexType idx1, + TensorInfo values2, IndexType idx2, + TensorInfo values3, IndexType idx3) { + for (IndexType k = blockIdx.x * blockDim.x + threadIdx.x; + k < blockSize; + k += gridDim.x * blockDim.x) { + op(values1.data + idx1 * blockSize + k, + values2.data + idx2 * blockSize + k, + values3.data + idx3 * blockSize + k); + } +} + +template +__global__ void sparseElementwiseKernel( + Op op, + TensorInfo dense, + TensorInfo indices, + TensorInfo values, + const IndexType nnz) { + IndexType indskip = indices.strides[0]; + IndexType valueSize = values.strides[0]; + for (IndexType linearId = blockIdx.x; + linearId < nnz; + linearId += gridDim.x) { + IndexType index = 0; + for (IndexType d = 0; d < indices.sizes[0]; d++) { + index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + } + Real *dst = dense.data + index * valueSize; + Real *src = values.data + linearId * valueSize; + for (IndexType linearId2 = threadIdx.x; linearId2 < valueSize; linearId2 += blockDim.x) { + op(dst + linearId2, src + linearId2); + } + } +} + +template +__global__ void sparseElementwiseKernelScalar( + Op op, + TensorInfo dense, + TensorInfo indices, + TensorInfo values, + const IndexType nnz) { + IndexType indskip = indices.strides[0]; + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < nnz; + linearId += gridDim.x * blockDim.x) { + IndexType index = 0; + for (IndexType d = 0; d < indices.sizes[0]; d++) { + index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + } + op(dense.data + index, values.data + linearId); + } +} + +template +__global__ void valueSparseUnionKernel( + OpBoth opBoth, + OpLeft opLeft, + OpRight opRight, + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + TensorInfo r_values, + TensorInfo t_values, + TensorInfo s_values, + const IndexType t_nnz, const IndexType s_nnz) { + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t cmp, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType valueSize = r_values.strides[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + cmp = 1; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) 
{ + cmp = -1; + break; + } + } + } + if (cmp == 0) applyOp3(opBoth, valueSize, r_values, r_i, t_values, t_i++, s_values, s_i++); + else if (cmp > 0) applyOp2(opLeft, valueSize, r_values, r_i, t_values, t_i++); + else if (cmp < 0) applyOp2(opRight, valueSize, r_values, r_i, s_values, s_i++); + r_i++; + } +} + +// TODO find a way to parallelize this... +template +__global__ void indexSparseUnionKernel( + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) { + IndexType r_indskip = r_indices.strides[0]; + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t cmp, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + cmp = 1; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + cmp = -1; + break; + } + } + } + if (cmp >= 0) { + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i]; + } + t_i++; + } + if (cmp <= 0) { + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = s_indices.data[d * s_indskip + s_i]; + } + s_i++; + } + r_i++; + } + *resultNnz = r_i; +} + +template +__global__ void valueSparseIntersectionKernel( + Op op, + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + TensorInfo r_values, + TensorInfo t_values, + TensorInfo s_values, + const IndexType t_nnz, const IndexType s_nnz) { + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t match, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType valueSize = r_values.strides[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz && s_i < s_nnz) { + match = 1; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) continue; + applyOp3(op, valueSize, r_values, r_i++, t_values, t_i++, s_values, s_i++); + } +} + +// TODO find a way to parallelize this... 
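The union and intersection kernels in this header share the same sequential core: walk two lexicographically sorted index lists with two cursors, advance whichever side has the smaller coordinate tuple, and emit an entry when all `nDimI` coordinates agree. The host-side sketch below shows the intersection variant in plain C++; the types and names are made up for illustration, while the real kernels operate on `TensorInfo` buffers on the GPU.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Both inputs are assumed coalesced, i.e. their coordinate tuples are
// lexicographically sorted, so a two-pointer scan finds all matches in
// O(t_nnz + s_nnz) comparisons.
struct Coo {
  std::vector<std::vector<int64_t>> indices;  // [nDimI][nnz]
  int64_t nnz() const { return indices.empty() ? 0 : static_cast<int64_t>(indices[0].size()); }
};

std::vector<std::pair<int64_t, int64_t>> intersect_sorted(const Coo& t, const Coo& s) {
  const int64_t nDimI = static_cast<int64_t>(t.indices.size());
  std::vector<std::pair<int64_t, int64_t>> matches;  // (t_i, s_i) pairs with equal coordinates
  int64_t t_i = 0, s_i = 0;
  while (t_i < t.nnz() && s_i < s.nnz()) {
    bool match = true;
    for (int64_t d = 0; d < nDimI; d++) {
      if (t.indices[d][t_i] < s.indices[d][s_i]) { t_i++; match = false; break; }
      if (t.indices[d][t_i] > s.indices[d][s_i]) { s_i++; match = false; break; }
    }
    if (!match) continue;
    matches.emplace_back(t_i++, s_i++);
  }
  return matches;
}

int main() {
  // 2-D coordinates; both lists are sorted lexicographically.
  Coo t{{{0, 0, 1}, {1, 2, 0}}};  // (0,1), (0,2), (1,0)
  Coo s{{{0, 1, 1}, {2, 0, 3}}};  // (0,2), (1,0), (1,3)
  auto m = intersect_sorted(t, s);
  assert(m.size() == 2);          // matches at (0,2) and (1,0)
  return 0;
}
```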
+template +__global__ void indexSparseIntersectionKernel( + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) { + IndexType r_indskip = r_indices.strides[0]; + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t match, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz && s_i < s_nnz) { + match = 1; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) continue; + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i]; + } + r_i++; t_i++; s_i++; + } + *resultNnz = r_i; +} + +// template +// __global__ void coalesceValuesKernel_gridStrided( +// long *segment_offsets, long *value_indices, +// Dtype *values, Dtype *newValues, +// long nnz, long newNnz, long stride) { +// +// long chunksPerSeg = THCCeilDiv(stride, (long) blockDim.x); +// long numChunks = newNnz * chunksPerSeg; +// long chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; +// long chunkStride = gridDim.x * blockDim.y; +// +// for (long chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { +// long featureDim = (chunk % chunksPerSeg) * blockDim.x + threadIdx.x; +// if (featureDim < stride) { +// auto valFeat = values + featureDim; +// long seg = chunk / chunksPerSeg; +// auto begin = segment_offsets[seg]; +// auto end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz; +// Acctype valSum = static_cast::to(0); +// for (long valIdx = begin; valIdx < end; valIdx++) { +// const long valRow = value_indices[valIdx] * stride; +// valSum += static_cast::to(valFeat[valRow]); +// } +// newValues[seg * stride + featureDim] = static_cast::to(valSum); +// } +// } +// } + +template +__global__ void coalesceValuesKernel( + int64_t *segment_offsets, int64_t *value_indices, + Dtype *values, Dtype *newValues, + int64_t nnz, int64_t newNnz, int64_t stride) { + + int seg = blockIdx.x * 4 + threadIdx.y; + + // Number of values processed by each thread (grain size) + const int SZ = 4; + + if (seg < newNnz) { + const int newValueRow = seg * stride; + const int begin = segment_offsets[seg]; + const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + Acctype tmp[SZ]; + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + tmp[ii] = 0; + } + for (int row = begin; row < end; row++) { + const int valueRow = ((int) value_indices[row]) * stride; + + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + tmp[ii] += static_cast(values[valueRow + featureDim]); + } + } + } + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + newValues[newValueRow + featureDim] = static_cast(tmp[ii]); + } + } + } +} + +} // namespace apply + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu new file mode 100644 index 0000000..0ed53be --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -0,0 +1,228 @@ +#include +#include +#include + +#include + +#include + +namespace at { namespace native { namespace sparse { namespace cuda { + +#ifndef __HIP_PLATFORM_HCC__ + +std::string cusparseGetErrorString(cusparseStatus_t status) { + switch(status) + { + case CUSPARSE_STATUS_SUCCESS: + return "success"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "library not initialized"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "resource allocation failed"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "an invalid numeric value was used as an argument"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "an absent device architectural feature is required"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "an access to GPU memory space failed"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "the GPU program failed to execute"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "an internal operation failed"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "the matrix type is not supported by this function"; + + case CUSPARSE_STATUS_ZERO_PIVOT: + return "an entry of the matrix is either structural zero or numerical zero (singular block)"; + + default: + { + std::ostringstream oss; + oss << "unknown error " << static_cast(status); + return oss.str(); + } + } +} + +inline void CUSPARSE_CHECK(cusparseStatus_t status) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + AT_ERROR("cusparse runtime error: ", cusparseGetErrorString(status)); + } +} + +inline cusparseHandle_t setCUDASparseStream() { + cusparseHandle_t handle = globalContext().getCurrentCUDASparseHandle(); + cusparseSetStream(handle, globalContext().getCurrentCUDAStream()); + return handle; +} + +void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr) { + AT_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), + "cusparseXcoo2csr only supports m, nnz with the bound [val] <= ", + INT_MAX); + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, coorowind, nnz, m, csrrowptr, + TH_INDEX_BASE ? 
CUSPARSE_INDEX_BASE_ONE : CUSPARSE_INDEX_BASE_ZERO + )); +} + +cusparseOperation_t convertTransToCusparseOperation(char trans) { + if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE; + else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE; + else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + else { + AT_ERROR("trans must be one of: t, n, c"); + } +} + +void adjustLd(char transb, int64_t m, int64_t n, int64_t k, int64_t *ldb, int64_t *ldc) +{ + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + *ldc = m; + + if(transb_) + { + if(k == 1) + *ldb = n; + } + else + { + if(n == 1) + *ldb = k; + } +} + +/* Level 3 */ +void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) +{ + adjustLd(transb, m, n, k, &ldb, &ldc); + cusparseOperation_t opa = convertTransToCusparseOperation(transa); + cusparseOperation_t opb = convertTransToCusparseOperation(transb); + + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + "cusparseScsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_nnz = (int)nnz; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseScsrmm2(handle, opa, opb, i_m, i_n, i_k, i_nnz, &alpha, desc, csrvala, csrrowptra, csrcolinda, b, i_ldb, &beta, c, i_ldc)); +} + +void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) +{ + adjustLd(transb, m, n, k, &ldb, &ldc); + cusparseOperation_t opa = convertTransToCusparseOperation(transa); + cusparseOperation_t opb = convertTransToCusparseOperation(transb); + + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + "cusparseDcsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_nnz = (int)nnz; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseDcsrmm2(handle, opa, opb, i_m, i_n, i_k, i_nnz, &alpha, desc, csrvala, csrrowptra, csrcolinda, b, i_ldb, &beta, c, i_ldc)); + // TODO: I think this leaks the matrix descriptor. 
Proper fix is to create + // real descriptor classes +} + +/* format conversion */ +void CreateIdentityPermutation(int64_t nnz, int *P) { + AT_CHECK((nnz <= INT_MAX), + "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + cusparseCreateIdentityPermutation(handle, i_nnz, P); +} + +void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <=", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcsrsort_bufferSizeExt(handle, i_m, i_n, i_nnz, csrRowPtr, csrColInd, pBufferSizeInBytes)); +} + +void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcsrsort only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseXcsrsort(handle, i_m, i_n, i_nnz, desc, csrRowPtr, csrColInd, P, pBuffer)); + // TODO: I think this leaks the matrix descriptor. +} + +void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcoosort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, i_m, i_n, i_nnz, cooRows, cooCols, pBufferSizeInBytes)); +} + +void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "XcoosortByRow only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, i_m, i_n, i_nnz, cooRows, cooCols, P, pBuffer)); +} + +#endif + +}}}} // namespace at::native::sparse::cuda diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh new file mode 100644 index 0000000..ed800fc --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace at { namespace native { namespace sparse { namespace cuda { + +AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr); + +/* Level 3 */ +AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); +AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); + +// overloaded version +inline void csrmm2(char 
transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { Scsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); } +inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { Dcsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); } + +/* format conversion */ +AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P); +AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes); +AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer); +AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes); +AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer); + +}}}} // namespace at::native::sparse::cuda diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp new file mode 100644 index 0000000..68ab33a --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -0,0 +1,61 @@ +#include +#include + +#include + +namespace at { namespace native { + +SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { + AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + t.sizes(), " but mask has size ", mask.sizes()); + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); + AT_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(_check_device({r, t, mask}), + "sparse_mask: arguments are located on different devices; self is on device ", t.get_device(), + ", mask is on device ", mask.get_device(), ", out is on device ", r.get_device()); + resize_as_sparse_(r, mask); + if (mask._nnz() == 0) { + return r.zero_(); + } + LongTensor mask_indices = mask._indices(); + Tensor mask_values = mask._values(); + Tensor r_values = r._values().type().tensor(mask_values.sizes()); + _alias_into_sparse(r, mask_indices.clone(), r_values); + _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); + _get_sparse_impl(r)->set_nnz(mask._nnz()); + + LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); + + for (int64_t d = 0; d < mask._sparseDims(); d++) { + indices.mul_(mask.size(d)); + // This used to use a buffer but I deoptimized it + indices.add_(mask_indices.select(0, d)); + } + + std::vector view_size(1 + mask._denseDims()); + view_size[0] = -1; + for (int64_t d = 0; d < mask._denseDims(); d++) { + view_size[d + 1] = mask.size(mask._sparseDims() + d); + } + + Tensor t_view = t.view(view_size); + // TODO: Re-audit this; it used to be an indexSelect directly into r_values + at::index_select_out(r_values, t_view, 0, indices); + + return r; +} + +SparseTensor sparse_mask_cuda(const Tensor& t, SparseTensorRef mask) { + SparseTensor r = t.type().toSparse().tensor(); + sparse_mask_out_cuda(r, t, 
mask.tref); + return r; +} + +// Technically, this is not actually CUDA specific +int64_t get_device_sparse_cuda(const Tensor& self) { + return self._values().get_device(); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu new file mode 100644 index 0000000..a12edc9 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +namespace at { namespace native { + +SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { +#ifndef __HIP_PLATFORM_HCC__ + int64_t nnz = self._nnz(); + if (nnz < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } + if (self.is_coalesced()) { + return self; + } + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + // Replace instances with + + // For indices, a simple sort + unique suffices + // For values, we use a custom kernel for segmented reduction (can't use Thrust due to indirection). + + // TODO: I'm not sure if this could ever be non-contiguous + LongTensor values = self._values().contiguous(); + + int64_t sparseDims = self._sparseDims(); + int64_t stride = values.stride(0); + + // indices will be modified by Thrust, so we have to clone or use new storage + // here. + LongTensor indices1D = _newFlattenedIndices(self, true); + + LongTensor origIndices = at::empty({nnz}, self._indices().options()); + LongTensor uniqueOffsets = at::empty({nnz}, self._indices().options()); + + typedef thrust::device_ptr thrust_ptr; + thrust_ptr indicesIter(indices1D.data()); + thrust_ptr origIndicesIter(origIndices.data()); + thrust_ptr uniqueOffsetsIter(uniqueOffsets.data()); + + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIterI(TH_INDEX_BASE); + thrust::counting_iterator countIterO(TH_INDEX_BASE); + + thrust::copy(policy, countIterI, countIterI + nnz, origIndicesIter); + thrust::copy(policy, countIterO, countIterO + nnz, uniqueOffsetsIter); + + thrust::sort_by_key(policy, + indicesIter, indicesIter + nnz, + origIndicesIter, ThrustLTOp() + ); + + // this forces device-host synchronization! 
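For reference, the flatten-sort-unique-sum flow used by this coalesce path can be sketched on the CPU in plain Python. This is only an illustrative sketch with made-up names, not the actual Thrust/CUDA kernel:

```python
# CPU sketch of coalescing a COO tensor: flatten indices, sort them while
# carrying the original positions, then sum values that share a key.
def coalesce_coo(indices, values, sizes):
    # indices: one list per sparse dim; values: one scalar per entry.
    sparse_dims, nnz = len(indices), len(values)

    # Flatten multi-dimensional indices into a single row-major key.
    flat = [0] * nnz
    for d in range(sparse_dims):
        for i in range(nnz):
            flat[i] = flat[i] * sizes[d] + indices[d][i]

    # sort_by_key: order the keys, remembering where each entry came from.
    order = sorted(range(nnz), key=lambda i: flat[i])

    # unique_by_key + segmented sum: duplicates collapse, their values add up.
    out_keys, out_vals = [], []
    for i in order:
        if out_keys and out_keys[-1] == flat[i]:
            out_vals[-1] += values[i]
        else:
            out_keys.append(flat[i])
            out_vals.append(values[i])

    # Unflatten the surviving keys back into per-dimension indices.
    out_indices = [[0] * len(out_keys) for _ in range(sparse_dims)]
    for j, key in enumerate(out_keys):
        for d in reversed(range(sparse_dims)):
            out_indices[d][j] = key % sizes[d]
            key //= sizes[d]
    return out_indices, out_vals

# A 3x3 tensor with a duplicate entry at (1, 2): values 2.0 and 3.0 merge.
print(coalesce_coo([[0, 1, 1], [0, 2, 2]], [1.0, 2.0, 3.0], [3, 3]))
# -> ([[0, 1], [0, 2]], [1.0, 5.0])
```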
+ thrust::pair newEnd = thrust::unique_by_key(policy, + indicesIter, indicesIter + nnz, + uniqueOffsetsIter + ); + int64_t newNnz = newEnd.first - indicesIter; + + indices1D.resize_({1, newNnz}); + std::vector newValues_size(values.sizes()); + newValues_size[0] = newNnz; + Tensor newValues = at::empty(newValues_size, values.options()); + + dim3 grid(THCCeilDiv(newNnz, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "coalesce_sparse_cuda", [&] { + using accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data(), + origIndices.data(), + values.data(), + newValues.data(), + nnz, + newNnz, + stride + ); + }); + +// this grid-strided version is slower but probably more flexible + // to different sizes + // int64_t blockX = min(stride, (int64_t) 512); + // dim3 block(blockX, 512 / blockX); + // int64_t grid = min((int64_t) 1024, THCCeilDiv((int64_t) newNnz * stride, (int64_t) block.x * block.y)); + // THCSTensor_coalesceValuesKernel_gridStrided<<>>( + // THCIndexTensor_(data)(state, uniqueOffsets), + // THCIndexTensor_(data)(state, origIndices), + // THCTensor_(data)(state, values), + // THCTensor_(data)(state, newValues), + // nnz, + // newNnz, + // stride + // ); + + //////////////////////////////////////////////////////////// + // unflatten indices if necessary + LongTensor newIndices; + if (sparseDims == 1) { + newIndices = indices1D; + } else { + newIndices = at::empty({sparseDims, newNnz}, origIndices.options()); + if (TH_INDEX_BASE != 0) { + indices1D.add_(-1); + } + for (int64_t d = sparseDims - 1; d >= 0; d--) { + // NB: Not a select, so I can preserve the outer dimension + LongTensor indicesSlice = newIndices.narrow(0, d, 1); + // Note for the porting guide: THCTensor_(copy) does NOT do normal + // broadcasting logic; instead, it will blast the elements from one + // to the other so long as the numel is the same + indicesSlice.copy_(indices1D); + indices1D.div_(self.size(d)); + indicesSlice.add_(indices1D, -self.size(d)); + } + if (TH_INDEX_BASE != 0) { + indices1D.add_(1); // "lol" + } + } + //////////////////////////////////////////////////////////// + + SparseTensor dst = ::at::native::sparse_coo_tensor(newIndices, newValues, self.sizes()); + _get_sparse_impl(dst)->set_coalesced(true); + + THCudaCheck(cudaGetLastError()); + return dst; +#else + AT_ERROR("coalesce_sparse_cuda: HIP not supported"); +#endif +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu new file mode 100644 index 0000000..3521fc3 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -0,0 +1,530 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define I_INFO(tensor) cuda::detail::getTensorInfo(tensor) +#define V_INFO(tensor) cuda::detail::getTensorInfo(tensor) + +namespace at { namespace native { + +// -------------------------------------------------------------------- +// Utility functions +// -------------------------------------------------------------------- + +#ifndef __HIP_PLATFORM_HCC__ +namespace { + IntTensor _to_csr_int(const LongTensor& rowIndices, int64_t dim, int64_t nnz) { + IntTensor csr = at::empty({dim+1}, CUDA(kInt)); + IntTensor rowIndicesInt = at::empty({rowIndices.size(0)}, CUDA(kInt)); + rowIndicesInt.copy_(rowIndices); + sparse::cuda::Xcoo2csr(rowIndicesInt.data(), nnz, dim, 
csr.data()); + return csr; + } +} +#endif + +// NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not +// wired at all) + +// -------------------------------------------------------------------- +// addmm(Tensor, SparseTensorRef, Tensor, Scalar, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, Scalar beta, Scalar alpha) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); + AT_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({sparse_, r_, t, dense})); + + // TODO: This error message seems awfully opaque + AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + + // mxk * kxn = mxn + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(t.size(0) == m, + "addmm: Argument #1 (t): Expected dim 0 size ", m, ", got ", t.size(0)); + AT_CHECK(t.size(1) == n, + "addmm: Argument #1 (t): Expected dim 1 size ", n, ", got ", t.size(1)); + AT_CHECK(dense.size(0) == k, + "addmm: Argument #3 (dense): Expected dim 0 size ", k, ", got ", dense.size(0)); + + r_.resize_({m, n}); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + + LongTensor rowIndices = indices.select(0, 0); + LongTensor colIndices = indices.select(0, 1); + IntTensor csr = _to_csr_int(rowIndices, m, nnz); + IntTensor colIndicesInt = at::empty({colIndices.size(0)}, indices.type().toScalarType(kInt)); + colIndicesInt.copy_(colIndices); + + // No half support, so we don't have to use CUDATypeConversion + Tensor r__; + AT_DISPATCH_FLOATING_TYPES( + values.type(), "addmm_sparse_cuda", [&] { + scalar_t cast_beta = beta.to(); + scalar_t cast_alpha = alpha.to(); + if (cast_beta == 0) { + r_.zero_(); + } else if (cast_beta == 1) { + if (!isSameTensor(t, r_)) { + r_.copy_(t); + } + } else { + at::mul_out(r_, t, beta); + } + + /* r_ */ + if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) { + r__ = r_; + } else { + // TODO: how... strange + r__ = r_.transpose(0, 1).clone(); + r__.transpose_(0, 1); + } + + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? 
dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + + }); + + r_.copy_(r__); + return r_; +#else + AT_ERROR("s_addmm_out_sparse_dense_cuda: HIP not supported"); +#endif +} + +Tensor s_addmm_sparse_dense_cuda( + const Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + Tensor r = t.type().tensor(); + s_addmm_out_sparse_dense_cuda(r, t, sparse, dense, beta, alpha); + return r; +} + +Tensor& s_addmm_sparse_dense_cuda_( + Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + return s_addmm_out_sparse_dense_cuda(t, t, sparse, dense, beta, alpha); +} + +// Deleted sspaddmm (sparse, dense) -> sparse + +// -------------------------------------------------------------------- +// hspmm(SparseTensor mat1, Tensor mat2) +// -------------------------------------------------------------------- + +SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse_, const Tensor& dense/* , Scalar alpha */) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(sparse_.is_cuda()); // dispatch argument + AT_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, sparse_, dense})); + + AT_CHECK(sparse_._sparseDims() == 2, + "hspmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); + + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(dense.size(0) == k, + "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); + + _get_sparse_impl(r_)->raw_resize_(1, 1, {m, n}); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + + LongTensor indices = at::empty({1, nnz}, CUDA(kLong)); + // create values in column-major format to avoid copying in spaddmm + Tensor values = at::empty({n, nnz}, dense.type()); + values.transpose_(0, 1); + + // why does sparse need to be cloned? If this is really necessary maybe we + // need to fuse this with newCoalesce + SparseTensor newSparse = sparse.clone(); + LongTensor spIndices = newSparse._indices(); + LongTensor dstIndices = spIndices.select(0, 0); + // Save destination indices to output hybrid tensor + indices.copy_(dstIndices); + // Replace destination indices with 0, 1, 2, 3, ... 
and compute output values + // tensor with sparse * dense multiplication + thrust::device_ptr indicesIter(dstIndices.data()); + thrust::sequence(policy, indicesIter, indicesIter + nnz); + _get_sparse_impl(newSparse)->_sizes_mut()[0] = nnz; // TODO: use something safer) + s_addmm_out_sparse_dense_cuda(values, values, newSparse, dense, 0, /*alpha*/ 1); + _get_sparse_impl(r_)->set_indices_and_values(indices, values); + + return r_; +#else + AT_ERROR("hspmm_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor hspmm_sparse_cuda(const SparseTensor& sparse, const Tensor& dense) { + SparseTensor r = sparse.type().tensor(); + hspmm_out_sparse_cuda(r, sparse, dense); + return r; +} + +// -------------------------------------------------------------------- +// add(Tensor, SparseTensorRef, Scalar) +// formerly known as spcadd +// -------------------------------------------------------------------- + +Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorRef sparse_, at::Scalar value) { +#ifndef __HIP_PLATFORM_HCC__ + const SparseTensor& sparse = sparse_.tref; + + AT_ASSERT(dense.is_cuda()); // dispatch argument + AT_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({sparse, r_, dense})); + + AT_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + r_.resize_as_(dense); + r_.copy_(dense); + return r_; + } + + Tensor r = r_; + if (!isSameTensor(r, dense)) { + r_.resize_as_(dense); + r_.copy_(dense); + } else { + AT_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); + r = r_.contiguous(); + } + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + int64_t nDim = dense.dim(); + int64_t nDimI = sparse._sparseDims(); + + if (sparse.is_coalesced()) { + // TODO benchmark to decide whether to remove this special case + const dim3 block = cuda::getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + if (sparse._denseDims() == 0) { + AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r_), I_INFO(indices), V_INFO(values), + static_cast(nnz)); + }); + } else { + AT_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + apply::sparseElementwiseKernel, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r_), I_INFO(indices), V_INFO(values), + static_cast(nnz)); + }); + } + } else { + LongTensor indices1D = _newFlattenedIndices(sparse, 0).squeeze_(0).narrow(0, 0, nnz); + + // FIXME: at some point we can wrap the scale into indexAdd + // NB: Purposely not inplace! 
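The uncoalesced branch that follows flattens each sparse index over the sparse dimensions and scatter-adds the scaled values into a flattened view of the output; duplicates simply accumulate, which is why no coalesce is needed first. A rough pure-Python sketch of that idea (illustrative names, scalar-valued entries assumed):

```python
# Sketch of r = dense + alpha * sparse on the uncoalesced path: flatten each
# sparse index and scatter-add the scaled value into the row-major dense data.
def add_dense_sparse(dense_flat, sizes, indices, values, alpha):
    r = list(dense_flat)                 # copy of the row-major dense input
    sparse_dims = len(indices)
    for k in range(len(values)):
        flat = 0
        for d in range(sparse_dims):     # flatten over the sparse dims only
            flat = flat * sizes[d] + indices[d][k]
        r[flat] += alpha * values[k]     # index_add_-style accumulation
    return r

dense = [10.0] * 6                       # a 2x3 matrix, row-major
print(add_dense_sparse(dense, [2, 3], [[0, 1], [1, 2]], [1.0, 2.0], alpha=0.5))
# -> [10.0, 10.5, 10.0, 10.0, 10.0, 11.0]
```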
+ AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + if (value.to() != static_cast(1)) { + values = values.mul(value); + } + }); + + int64_t view_rows = 1; + int64_t view_columns = 1; + for (int i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int i = nDimI; i < nDim; i++) { + view_columns *= r.size(i); + } + + Tensor r_view = r.view({view_rows, view_columns}); + values = values.narrow(0, 0, nnz).reshape({nnz, view_columns}); + r_view.index_add_(0, indices1D, values); + } + THCudaCheck(cudaGetLastError()); + + return r_; +#else + AT_ERROR("add_out_dense_sparse_cuda: HIP not supported"); +#endif +} + +Tensor add_dense_sparse_cuda(const Tensor& t, SparseTensorRef src, Scalar alpha) { + Tensor r = t.type().tensor(); + add_out_dense_sparse_cuda(r, t, src, alpha); + return r; +} + +Tensor& add_dense_sparse_cuda_(Tensor& t, SparseTensorRef src, Scalar alpha) { + return add_out_dense_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// add(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, t, src})); + AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); + + if (src._nnz() == 0) { + return raw_copy_sparse_(r_, t); + } + if (t._nnz() == 0) { + return mul_out_sparse_scalar(r_, src, value); + } + + AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + + // We deliberately choose to simply concat the indices and values tensors + // rather than merging them. This removes the need to synchronously fetch nnz + // at the end of the operation, at the cost of having a non-coalesced result. + // This trade-off is preferable for the common use-case of gradient accumulation. + LongTensor t_indices_ = t._indices(); + Tensor t_values_ = t._values(); + LongTensor s_indices_ = src._indices(); + Tensor s_values_ = src._values(); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + s_values_.type(), "s_add_out_sparse_cuda", [&] { + if (value.to() != static_cast(1)) { + s_values_ = s_values_.mul(value); + } + }); + + LongTensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_}, 0); + r_.resize_as_(src); + _alias_into_sparse(r_, r_indices_, r_values_); + + // FIXME: add some heuristic about when to call coalesce() here, so that + // tensors don't totally blow up in size by concatenation; e.g. 
+ // r->minUnique = max(a->minUnique + b->minUnique); + // if (r->nnz / r->minUnique > COMPACTION_THRESHOLD) { + // THCSTensor_(contiguous)(r); + // r->minUnique = r->nnz; + // } + + return r_; +#else + AT_ERROR("s_add_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor s_add_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_add_out_sparse_cuda(r, t, src, alpha); + return r; +} + +SparseTensor& s_add_sparse_cuda_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_add_out_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// sub(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_sub_out_sparse_cuda(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(src.is_cuda(), "sub: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r.is_cuda(), "sub: expected 'out' to be CUDA, but got CPU"); + + AT_DISPATCH_ALL_TYPES( + t.type(), "sub_sparse", [&] { + scalar_t cast_value = value.to(); + s_add_out_sparse_cuda(r, t, src, -cast_value); + } + ); + return r; +} + +SparseTensor s_sub_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_sub_out_sparse_cuda(r, t, src, alpha); + return r; +} + +SparseTensor& s_sub_sparse_cuda_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_sub_out_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// mul(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, const SparseTensor& src_) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t_.is_cuda()); // dispatch argument + AT_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, t_, src_})); + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); + + SparseTensor t = t_.coalesce(); + SparseTensor src = src_.coalesce(); + + if (src_._nnz() == 0 || t_._nnz() == 0) { + return r_.zero_(); + } + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); + int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices_ = t._indices(); + Tensor t_values_ = t._values(); + LongTensor s_indices_ = src._indices(); + Tensor s_values_ = src._values(); + LongTensor r_indices_ = t_indices_.type().tensor({sparseDims, max_nnz}); + Tensor r_values_ = _new_values_with_size_of(t_values_, max_nnz).zero_(); + r_.resize_as_(src); + _get_sparse_impl(r_)->set_indices_and_values(r_indices_, r_values_); // TODO: sigh + + int64_t valueSize = t_values_.stride(0); + const dim3 block = dim3(std::min(static_cast(cuda::getApplyBlock().x), valueSize)); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large 
or too many dimensions"); + + LongTensor resultNnz = at::empty({1}, CUDA(kLong)); + AT_DISPATCH_ALL_TYPES_AND_HALF( + t_values_.type(), "s_mul_out_sparse_cuda", [&] { + apply::valueSparseIntersectionKernel, uint64_t, scalar_t> + <<>>( + TensorMulOp(), + I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), + V_INFO(r_values_), V_INFO(t_values_), V_INFO(s_values_), + static_cast(t_nnz), static_cast(s_nnz)); + THCudaCheck(cudaGetLastError()); + + apply::indexSparseIntersectionKernel + <<<1, 1, 0, stream>>>( + I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), + // reinterpret_cast shenanigans, because we don't actually have + // unsigned tensors... + static_cast(t_nnz), static_cast(s_nnz), reinterpret_cast(resultNnz.data_ptr())); + THCudaCheck(cudaGetLastError()); + }); + + // sync! (surely there is a more idiomatic way to do this...) + LongTensor cpu_resultNnz = at::empty({1}, CPU(kLong)); + cpu_resultNnz.copy_(resultNnz); + _get_sparse_impl(r_)->set_nnz(cpu_resultNnz.accessor()[0]); + _get_sparse_impl(r_)->set_coalesced(true); + + return r_; +#else + AT_ERROR("s_mul_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor s_mul_sparse_cuda(const SparseTensor& t, const SparseTensor& src) { + SparseTensor r = t.type().tensor(); + s_mul_out_sparse_cuda(r, t, src); + return r; +} + +SparseTensor& s_mul_sparse_cuda_(SparseTensor& t, const SparseTensor& src) { + return s_mul_out_sparse_cuda(t, t, src); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h new file mode 100644 index 0000000..3b42b61 --- /dev/null +++ b/aten/src/ATen/native/utils/ParamsHash.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < (int)sizeof(Params); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + + +}} // at::native diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py new file mode 100644 index 0000000..13d852d --- /dev/null +++ b/aten/src/ATen/native_parse.py @@ -0,0 +1,147 @@ +from __future__ import print_function +import re +import yaml +import pprint +import sys + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +def parse_default(s): + if s.lower() == 'true': + return True + elif s.lower() == 'false': + return False + elif s == 'nullptr': + return s + elif s == '{}': + return '{}' + elif s == 'nullopt': + return s + try: + return int(s) + except Exception: + try: + return float(s) + except Exception: + return s + + +def sanitize_types(typ): + # split tuples into constituent list + if typ[0] == '(' and 
typ[-1] == ')': + return [x.strip() for x in typ[1:-1].split(',')] + elif typ == 'Generator*': + return ['Generator *'] + return [typ] + + +def parse_arguments(args, func_decl, func_name, func_return): + arguments = [] + python_default_inits = func_decl.get('python_default_init', {}) + is_out_fn = func_name.endswith('_out') + if is_out_fn and func_decl.get('variants', []) not in ['function', ['function']]: + raise RuntimeError("Native functions suffixed with _out MUST be declared with only the function variant; " + "e.g., variants: function; otherwise you will tickle a Python argument binding bug " + "(which usually manifests itself as the result variable being undefined.) " + "The culprit was: {}".format(func_name)) + kwarg_only = False + + if len(args.strip()) == 0: + return arguments + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg_idx, arg in enumerate(args.split(', ')): + type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] + if type_and_name == ['*']: + assert not kwarg_only + kwarg_only = True + continue + + t, name = type_and_name + default = None + python_default_init = None + + if '=' in name: + ns = name.split('=', 1) + name, default = ns[0], parse_default(ns[1]) + + if name in python_default_inits: + assert default is None + python_default_init = python_default_inits[name] + + typ = sanitize_types(t) + assert len(typ) == 1 + argument_dict = {'type': typ[0].rstrip('?'), 'name': name, 'is_nullable': typ[0].endswith('?')} + match = re.match(r'IntList\[(\d+)\]', argument_dict['type']) + if match: + argument_dict['type'] = 'IntList' + argument_dict['size'] = int(match.group(1)) + if default is not None: + argument_dict['default'] = default + if python_default_init is not None: + argument_dict['python_default_init'] = python_default_init + # TODO: convention is that the ith-argument correspond to the i-th return, but it would + # be better if we just named everything and matched by name. 
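For intuition about what this argument parser emits, here is a hand-traced sketch for the hypothetical declaration `norm(Tensor self, Scalar p=2, *, bool keepdim=false)` (not taken from a real YAML file; the exact dict layout may differ slightly):

```python
# Hand-traced sketch, not generated output: roughly the argument dicts that
# parse_arguments would build for the hypothetical signature above.
expected_arguments = [
    {'type': 'Tensor', 'name': 'self', 'is_nullable': False},
    {'type': 'Scalar', 'name': 'p', 'is_nullable': False, 'default': 2},
    # the bare '*' flips kwarg_only for everything that follows
    {'type': 'bool', 'name': 'keepdim', 'is_nullable': False,
     'default': False, 'kwarg_only': True},
]
print(expected_arguments)
```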
+ if is_out_fn and arg_idx < len(func_return): + argument_dict['output'] = True + if kwarg_only: + argument_dict['kwarg_only'] = True + + arguments.append(argument_dict) + return arguments + + +def has_sparse_dispatches(dispatches): + for dispatch in dispatches: + if 'Sparse' in dispatch: + return True + return False + + +def parse_native_yaml(path): + with open(path, 'r') as f: + return yaml.load(f, Loader=Loader) + + +def run(paths): + declarations = [] + for path in paths: + for func in parse_native_yaml(path): + declaration = {'mode': 'native'} + try: + if '->' in func['func']: + func_decl, return_type = [x.strip() for x in func['func'].split('->')] + return_type = sanitize_types(return_type) + else: + func_decl = func['func'] + return_type = [None] + fn_name, arguments = func_decl.split('(') + arguments = arguments.split(')')[0] + declaration['name'] = func.get('name', fn_name) + return_type = list(func.get('return', return_type)) + arguments = parse_arguments(arguments, func, declaration['name'], return_type) + output_arguments = [x for x in arguments if x.get('output')] + declaration['return'] = return_type if len(output_arguments) == 0 else output_arguments + declaration['variants'] = func.get('variants', ['method', 'function']) + declaration['deprecated'] = func.get('deprecated', False) + declaration['device_guard'] = func.get('device_guard', True) + declaration['arguments'] = func.get('arguments', arguments) + declaration['type_method_definition_dispatch'] = func.get('dispatch', declaration['name']) + declaration['aten_sparse'] = has_sparse_dispatches( + declaration['type_method_definition_dispatch']) + declarations.append(declaration) + except Exception as e: + msg = '''Exception raised in processing function: +{func} +Generated partial declaration: +{decl}'''.format(func=pprint.pformat(func), decl=pprint.pformat(declaration)) + print(msg, file=sys.stderr) + raise e + + return declarations diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml new file mode 100644 index 0000000..4590777 --- /dev/null +++ b/aten/src/ATen/nn.yaml @@ -0,0 +1,284 @@ +# Loss functions + +- name: binary_cross_entropy(Tensor self, Tensor target, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean) + cname: BCECriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: kl_div(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: DistKLDivCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: l1_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: AbsCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: mse_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: MSECriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: multi_margin_loss(Tensor self, LongTensor target, Scalar p=1, Scalar margin=1, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean) + cname: MultiMarginCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: multilabel_margin_loss(Tensor self, LongTensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: MultiLabelMarginCriterion + buffers: [is_target] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + is_target: target_->isScalar() + +- name: nll_loss(Tensor self, LongTensor target, Tensor weight={}, int64_t 
reduction=Reduction::ElementwiseMean, int64_t ignore_index=-100) + cname: ClassNLLCriterion + buffers: [total_weight] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + total_weight: 'true' + +- name: nll_loss2d(Tensor self, LongTensor target, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean, int64_t ignore_index=-100) + cname: SpatialClassNLLCriterion + buffers: [total_weight] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + total_weight: 'true' + +- name: smooth_l1_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: SmoothL1Criterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: soft_margin_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: SoftMarginCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +# Activation functions + +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) + cname: ELU + has_inplace: True + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +- name: glu(Tensor self, int64_t dim=-1) + cname: GatedLinear + wrap_dim: + dim: self + scalar_check: + output: 'false' + +- name: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) + cname: HardTanh + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: leaky_relu(Tensor self, Scalar negative_slope=0.01) + cname: LeakyReLU + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: log_sigmoid(Tensor self) + cname: LogSigmoid + buffers: [buffer] + scalar_check: + output: self_->isScalar() + buffer: self_->isScalar() + +- name: prelu(Tensor self, Tensor weight) + cname: PReLU + scalar_check: + output: self_->isScalar() + +# NOTE: we treat noise as an input (it's really a buffer) because the codegen +# can't handle in-place functions that have buffers +- name: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) + cname: RReLU + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) + cname: SoftPlus + scalar_check: + output: self_->isScalar() + +- name: softshrink(Tensor self, Scalar lambd=0.5) + cname: SoftShrink + scalar_check: + output: self_->isScalar() + +- name: threshold(Tensor self, Scalar threshold, Scalar value) + cname: Threshold + has_inplace: True + scalar_check: + output: self_->isScalar() + +# Pooling + +- name: adaptive_avg_pool2d(Tensor self, IntList[2] output_size) + cname: SpatialAdaptiveAveragePooling + +- name: adaptive_avg_pool3d(Tensor self, IntList[3] output_size) + cname: VolumetricAdaptiveAveragePooling + +- name: adaptive_max_pool2d(Tensor self, IntList[2] output_size) + cname: SpatialAdaptiveMaxPooling + +- name: adaptive_max_pool3d(Tensor self, IntList[3] output_size) + cname: VolumetricAdaptiveMaxPooling + +- name: avg_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, bool ceil_mode=false, bool count_include_pad=true) + cname: SpatialAveragePooling + default_init: + stride: kernel_size + +- name: avg_pool3d(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, bool ceil_mode=false, bool count_include_pad=true) + cname: VolumetricAveragePooling + default_init: + stride: kernel_size + +- name: fractional_max_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] output_size, 
Tensor random_samples) + cname: SpatialFractionalMaxPooling + scalar_check: + output: 'false' + +- name: max_pool2d_with_indices(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false) + cname: SpatialDilatedMaxPooling + default_init: + stride: kernel_size + +- name: max_pool3d_with_indices(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false) + cname: VolumetricDilatedMaxPooling + default_init: + stride: kernel_size + +- name: max_unpool2d(Tensor self, LongTensor indices, IntList[2] output_size) + cname: SpatialMaxUnpooling + +- name: max_unpool3d(Tensor self, LongTensor indices, IntList[3] output_size, IntList[3] stride, IntList[3] padding) + cname: VolumetricMaxUnpooling + +# Padding + +- name: reflection_pad1d(Tensor self, IntList[2] padding) + cname: TemporalReflectionPadding + +- name: reflection_pad2d(Tensor self, IntList[4] padding) + cname: SpatialReflectionPadding + +- name: replication_pad1d(Tensor self, IntList[2] padding) + cname: TemporalReplicationPadding + +- name: replication_pad2d(Tensor self, IntList[4] padding) + cname: SpatialReplicationPadding + +- name: replication_pad3d(Tensor self, IntList[6] padding) + cname: VolumetricReplicationPadding + +# Upsampling + +# Note: The upsampling backwards functions also include an IntList input_size +# parameter, which is added by nn_parse.py + +- name: upsample_linear1d(Tensor self, IntList[1] output_size, bool align_corners) + cname: TemporalUpSamplingLinear + scalar_check: + grad_input: 'false' + +- name: upsample_bilinear2d(Tensor self, IntList[2] output_size, bool align_corners) + cname: SpatialUpSamplingBilinear + scalar_check: + grad_input: 'false' + +- name: upsample_trilinear3d(Tensor self, IntList[3] output_size, bool align_corners) + cname: VolumetricUpSamplingTrilinear + scalar_check: + grad_input: 'false' + +- name: upsample_nearest1d(Tensor self, IntList[1] output_size) + cname: TemporalUpSamplingNearest + scalar_check: + grad_input: 'false' + +- name: upsample_nearest2d(Tensor self, IntList[2] output_size) + cname: SpatialUpSamplingNearest + scalar_check: + grad_input: 'false' + +- name: upsample_nearest3d(Tensor self, IntList[3] output_size) + cname: VolumetricUpSamplingNearest + scalar_check: + grad_input: 'false' + + +# Private functions. These also exist in TH, but we want the backwards functions +# to implement derivatives. + +- name: _sigmoid(Tensor self) + cname: Sigmoid + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +- name: _tanh(Tensor self) + cname: Tanh + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +# Batch normalization + +# The buffers here are somewhat hazardous, because their type will be +# based off of self, even though you may plausibly wish running_mean +# and running_var to have different precision than self (e.g., +# BatchNorm on half). Fortunately, THNN doesn't actually ever do this, +# so the buffer allocation code is "correct". If you ever do fix this, +# you should just port the function entirely to a native ATen function. 
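Each record in this file pairs an ATen-style signature (`name`) with the THNN module that backs it (`cname`), plus optional metadata such as `buffers`, `default_init`, `scalar_check`, and `has_inplace`. A minimal sketch of loading one such record (the `softshrink` entry above) with PyYAML; `nn_parse.py`, further below, turns a list of these into declaration dicts:

```python
import yaml

# What one record (the softshrink entry above) looks like once loaded.
snippet = """
- name: softshrink(Tensor self, Scalar lambd=0.5)
  cname: SoftShrink
  scalar_check:
    output: self_->isScalar()
"""
(entry,) = yaml.safe_load(snippet)
print(entry['name'])          # softshrink(Tensor self, Scalar lambd=0.5)
print(entry['cname'])         # SoftShrink
print(entry['scalar_check'])  # {'output': 'self_->isScalar()'}
```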
+- name: thnn_batch_norm(Tensor self, Tensor weight, Tensor bias, Tensor running_mean, Tensor running_var, bool training, double momentum, double eps) + cname: BatchNormalization + buffers: [save_mean, save_std] + +# Convolutions + +- name: thnn_conv_transpose2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, IntList[2] dilation=1) + cname: SpatialFullDilatedConvolution + buffers: [columns, ones] + +- name: thnn_conv_transpose3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, IntList[3] dilation=1) + cname: VolumetricFullDilatedConvolution + buffers: [finput, fgrad_input] + +- name: thnn_conv2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0) + cname: SpatialConvolutionMM + buffers: [finput, fgrad_input] + +- name: thnn_conv_depthwise2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1) + cname: SpatialDepthwiseConvolution + buffers: [] + +- name: thnn_conv3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0) + cname: VolumetricConvolutionMM + buffers: [finput, fgrad_input] + +- name: thnn_conv_dilated2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1) + cname: SpatialDilatedConvolution + buffers: [columns, ones] + +- name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) + cname: VolumetricDilatedConvolution + buffers: [columns, ones] + +# Vision + +- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) + cname: SpatialGridSamplerBilinear + +- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) + cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py new file mode 100644 index 0000000..d3e46f8 --- /dev/null +++ b/aten/src/ATen/nn_parse.py @@ -0,0 +1,415 @@ +import copy +import re +import common_with_cwrap +import yaml +from collections import OrderedDict, defaultdict + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +# matches `name`, `params` in `name(params)` +NAME_PARAM_REGEX = r'(\w+)\((.*)\)' + + +def argument_to_declaration(param, func=None): + arg = {} + arg['type'], name = param.split(' ') + if arg['type'] == 'Tensor': + arg['type'] = 'THTensor*' + elif arg['type'] == 'LongTensor': + arg['type'] = 'THIndexTensor*' + elif arg['type'] == 'Scalar': + arg['type'] = 'accreal' + elif arg['type'] == 'Generator*': + arg['type'] = 'THGenerator*' + + match = re.match(r'IntList\[(\d+)\]', arg['type']) + if match: + arg['type'] = 'IntList' + arg['size'] = int(match.group(1)) + + if '=' in name: + name, default = name.split('=') + arg['optional'] = True + arg['default'] = default + arg['name'] = name + + if func is not None: + default_inits = func.get('default_init', {}) + wrap_dims = func.get('wrap_dim', {}) + if name in default_inits: + # non constexpr defaults + arg['default_init'] = default_inits[name] + if name in wrap_dims: + arg['wrap_dim'] = wrap_dims[name] + + return arg + + +def output_arguments(thnn_function): + cname = 
thnn_function.name + output_args = [] + + # function_wrapper expects everything in a declaration to be in + # the base type (i.e. THTensor*), but if we pull a THCUNN only + # implementation, it will have THCTensor* as the arg type. So we + # strip the THC here before returning + def map_to_th_type(t): + if t.startswith('THC'): + t = t.replace('THC', 'TH') + return t + + def is_output_arg(arg_name, func_name): + if arg_name == 'output' and 'updateOutput' in cname: + return True + if name in {'gradInput', 'gradWeight', 'gradBias', 'gradGrid'}: + return True + if arg_name == 'indices' and 'updateOutput' in cname and 'Unpool' not in cname: + # indices is an output argument in pooling and an input in unpooling + return True + return False + + for arg in thnn_function.arguments: + name = arg.name + if is_output_arg(name, cname): + desc = { + 'type': map_to_th_type(arg.type), + 'name': camel_to_snake(name), + 'output': True, + } + if name.startswith('grad_'): + desc['is_nullable'] = True + output_args.append(desc) + return output_args + + +def get_return(args): + indices = [str(idx) for idx, arg in enumerate(args) if arg.get('output')] + return 'argument {}'.format(','.join(indices)) + + +ARGUMENT_MAPPINGS = { + 'k': 'kernel_size', + 'd': 'stride', + 'pad': 'padding', + 'p': 'padding', + 'o': 'output_size', + 'osize': 'output_size', + 'output': 'output_size', # as a prefix e.g. outputW + 'isize': 'input_size', + 'dilation': 'dilation', + 'adj': 'output_padding', + 'a': 'output_padding', +} + +DIMENSION_OFFSET = { + 'width': -1, + 'height': -2, + 'B': 0, + 'C': 1, + 'W': -1, + 'H': -2, + 'T': -3, + 'left': 0, + 'right': 1, + 'top': 2, + 'bottom': 3, + 'front': 4, + 'back': 5, +} + +SUBSTITUTIONS = { + 'input': 'self', + 'weights': 'weight', + 'train': 'training', + 'val': 'value', + 'lambda': 'lambd', + 'negval': 'negative_slope', +} + + +def camel_to_snake(name): + # from https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def get_thnn_args(thnn_function, params, inplace): + params_by_name = {p['name']: p for p in params} + + def arg_expr(prefix, suffix): + # e.g kW, kH + name = ARGUMENT_MAPPINGS[prefix] + if name not in params_by_name: + raise RuntimeError('missing arg "{}" in {}'.format(name, thnn_function.name)) + param = params_by_name[name] + if param['type'] == 'IntList' and 'size' in param: + name = name + '_' + index = DIMENSION_OFFSET[suffix] + if index < 0: + index += param['size'] + expr = '{}[{}]'.format(name, index) + return {'type': 'EXPRESSION', 'name': expr} + + thnn_args = [] + for arg in thnn_function.arguments: + name = arg.name + if name == 'state': + continue + if inplace and name == 'output': + name = 'self' + aten_name = camel_to_snake(SUBSTITUTIONS.get(name, name)) + parts = aten_name.split('_') + if aten_name in params_by_name: + param = params_by_name[aten_name] + if arg.is_optional: + param['is_nullable'] = True + thnn_args.append(copy.deepcopy(param)) + elif len(parts) == 2 and parts[0] in ARGUMENT_MAPPINGS and parts[1] in DIMENSION_OFFSET: + # e.g. 
pad_left + thnn_args.append(arg_expr(parts[0], parts[1])) + elif name[-1] in DIMENSION_OFFSET and name[:-1] in ARGUMENT_MAPPINGS: + # e.g kW, kH + thnn_args.append(arg_expr(name[:-1], name[-1])) + elif name == 'owidth' or name == 'oheight': + thnn_args.append(arg_expr(name[0], name[1:])) + elif name == 'scale': + thnn_args.append({'type': 'EXPRESSION', 'name': '1'}) + elif name == 'inplace': + thnn_args.append({'type': 'EXPRESSION', 'name': str(inplace).lower()}) + else: + raise RuntimeError("{}: can't find binding for '{}'" + .format(thnn_function.name, name)) + return thnn_args + + +def remove_unused_args(args, thnn_args): + """Returns the subset of args whose name appears in thnn_args""" + def clean_name(name): + name = name[:name.index('[')] if '[' in name else name + if name.endswith('_'): + name = name[:-1] + return name + uses = set([clean_name(arg['name']) for arg in thnn_args]) + uses.add('output_mask') + args = [arg for arg in args if arg['name'] in uses] + for arg in args: + if 'default' in arg: + del arg['default'] + return args + + +def unique_args(argslist): + result = [] + seen = set() + for args in argslist: + for arg in args: + if arg['name'] in seen: + continue + seen.add(arg['name']) + result.append(arg) + return result + + +def function_info(name, arguments, cimpls, buffers, backends, inplace, scalar_check): + """ + cimpls contains information use to call into THNN: + cname: THNN function name + arguments: arguments to functional call + condition: [optional] guard around call + """ + return { + 'mode': 'NN', + 'name': name, + 'types': ['Float', 'Double', 'Half'], # Half will be stripped for CPU backend + 'arguments': arguments, + 'return': 'argument 0' if inplace else get_return(arguments), + 'buffers': buffers, + 'backends': backends, + 'cimpls': cimpls, + 'scalar_check': scalar_check, + 'variants': ['function'], + } + + +def base_declaration(func, thnn_function, backends, inplace=False): + """Creates the NN function without any buffers in it's signature""" + name, params = re.match(NAME_PARAM_REGEX, func['name']).groups() + if inplace: + name += '_' + params = params.split(', ') + arguments = [argument_to_declaration(a, func) for a in params] + if not inplace: + arguments += output_arguments(thnn_function) + buffers = [argument_to_declaration('Tensor ' + buf) + for buf in func.get('buffers', [])] + + return function_info(name, arguments, None, buffers, backends, inplace, func.get('scalar_check')) + + +def forward_declaration(base, thnn_function, inplace=False): + name = '{}_forward'.format(base['name']) + if inplace: + name += '_' + + arguments = [copy.deepcopy(arg) for arg in base['arguments'] + if not arg.get('output')] + + arguments += output_arguments(thnn_function) + for buffer in base['buffers']: + buffer = copy.deepcopy(buffer) + buffer['output'] = True + arguments.append(buffer) + + thnn_args = get_thnn_args(thnn_function, arguments, inplace) + arguments = remove_unused_args(arguments, thnn_args) + cimpl = {'cname': thnn_function.name, 'arguments': thnn_args} + + scalar_check = base['scalar_check'] + if scalar_check is not None: + output_arg_names = [arg['name'] for arg in arguments if arg.get('output', False)] + scalar_check = {k: v for (k, v) in scalar_check.items() if k in output_arg_names} + + return function_info(name, arguments, [cimpl], [], base['backends'], inplace, scalar_check) + + +def backward_declaration(base, thnn_functions): + name = '{}_backward'.format(base['name']) + + arguments = [] + arguments.append({'type': 'THTensor*', 'name': 
'grad_output'}) + arguments += [copy.deepcopy(arg) for arg in base['arguments'] + if arg['name'] != 'inplace'] + arguments += base['buffers'] + + if 'upsample' in base['name']: + # Add input_size as parameter to upsample backwards functions + # Note that input_size is 4-dim for upsample_xxx2d + size = 2 + int(re.search(r'(\d+)d', base['name']).group(1)) + input_size_arg = {'type': 'IntList', 'name': 'input_size', 'size': size} + for output_size_idx, arg in enumerate(arguments): + if arg['name'] == 'output_size': + break + arguments.insert(output_size_idx + 1, input_size_arg) + + # outputs from the forward may be inputs to the backwards + for arg in arguments: + if 'output' in arg: + del arg['output'] + + arguments += unique_args([output_arguments(f) for f in thnn_functions]) + + def initialize_output_arg(arg): + # the mask array specifies which return values to compute + arg['mask'] = True + arg['is_nullable'] = True + + # grad_weight and grad_bias need to be resized and zeroed + if arg['name'] == 'grad_weight': + arg['resize'] = 'weight' + arg['zero'] = True + if arg['name'] == 'grad_bias': + dim = 1 if 'transpose' in name else 0 + arg['resize'] = [('weight', dim)] + arg['zero'] = True + + is_batch_norm_backward = '_backward' in thnn_functions[0].name + grad_params = [] + if len(thnn_functions) > 1 or is_batch_norm_backward: + for arg in arguments: + if arg.get('output', False): + initialize_output_arg(arg) + if 'Tensor' in arg['type'] and arg['name'].startswith('grad_') and \ + 'input' not in arg['name'] and 'output' not in arg['name']: + grad_params.append(arg['name']) + + thnn_args = [get_thnn_args(f, arguments, False) for f in thnn_functions] + arguments = remove_unused_args(arguments, unique_args(thnn_args)) + cimpls = [] + + def get_condition(func): + # only call into the THNN functions if the output args are not null + if '_updateGradInput' in func.name: + return 'grad_input_' + if '_accGradParameters' in func.name: + return ' || '.join(p + '_' for p in grad_params) + return None + + for func, args in zip(thnn_functions, thnn_args): + cimpl = {'cname': func.name, 'arguments': args} + if len(thnn_functions) > 1: + cimpl['condition'] = get_condition(func) + cimpls.append(cimpl) + + output_args = [arg for arg in arguments if arg.get('output', False)] + scalar_check_arg = base['scalar_check'] if base['scalar_check'] is not None else dict() + scalar_check = {k: v for (k, v) in scalar_check_arg.items() if k in [a['name'] for a in output_args]} + for arg in output_args: + # resize automatically sets scalar_check + if scalar_check.get(arg['name']) is not None or arg.get('resize', False): + pass + else: + base_name = arg['name'][len('grad_'):] if arg['name'] != 'grad_input' else 'self' + if base_name in [a['name'] for a in arguments]: + scalar_check[arg['name']] = base_name + '_->isScalar()' + else: + raise ValueError(("Could not infer scalar_check for {} argument of func {} because {} " + "does not exist. Please explicitly specify scalar_check." 
+ .format(arg['name'], name, base_name))) + + return function_info(name, arguments, cimpls, [], base['backends'], False, scalar_check) + + +def parse_nn_yaml(filename): + with open(filename, 'r') as f: + return yaml.load(f, Loader=Loader) + + +include_only = '(updateOutput|updateGradInput|accGradParameters|backward)$' +exclude = 'LookupTable' + + +def run(paths): + function_backends = defaultdict(list) + header_functions = OrderedDict() + + headers = [p for p in paths if p.endswith('.h')] + yamls = [p for p in paths if p.endswith('.yaml')] + + for path in headers: + backend = 'CUDA' if re.search('THCU', path) else 'CPU' + for func in common_with_cwrap.parse_header(path): + if re.search(include_only, func.name) is None or re.search(exclude, func.name) is not None: + continue + function_backends[func.name].append(backend) + if func.name not in header_functions: + header_functions[func.name] = func + + bwd_suffixes = ['_updateGradInput', '_accGradParameters', '_backward'] + + declarations = [] + for path in yamls: + for func in parse_nn_yaml(path): + cname = func['cname'] + backends = function_backends[cname + '_updateOutput'] + + fwd_function = header_functions[cname + '_updateOutput'] + bwd_functions = [] + for suffix in bwd_suffixes: + if cname + suffix in header_functions: + bwd_functions.append(header_functions[cname + suffix]) + + base = base_declaration(func, fwd_function, backends) + declarations.append(base) + declarations.append(forward_declaration(base, fwd_function)) + declarations.append(backward_declaration(base, bwd_functions)) + + if func.get('has_inplace', False): + declarations.append(base_declaration(func, fwd_function, backends, True)) + declarations.append(forward_declaration(base, fwd_function, True)) + + return declarations diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h new file mode 100644 index 0000000..287ddd8 --- /dev/null +++ b/aten/src/ATen/optional.h @@ -0,0 +1,982 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. + +#pragma once + +# include +# include +# include +# include +# include +# include +# include + +# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +# if defined __GNUC__ // NOTE: GNUC is also defined for Clang +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# endif +# endif +# +# if defined __clang_major__ +# if (__clang_major__ == 3 && __clang_minor__ >= 5) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# elif (__clang_major__ > 3) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# endif +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# endif +# endif +# +# if defined _MSC_VER +# if (_MSC_VER >= 1900) +# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# endif +# endif + +# if defined __clang__ +# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif +# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif + + +# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +# else +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +# define OPTIONAL_CONSTEXPR_INIT_LIST +# endif + +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) +# define OPTIONAL_HAS_MOVE_ACCESSORS 1 +# else +# define OPTIONAL_HAS_MOVE_ACCESSORS 0 +# endif + +# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +# if (defined __cplusplus) && (__cplusplus == 201103L) +# define OPTIONAL_MUTABLE_CONSTEXPR +# else +# define OPTIONAL_MUTABLE_CONSTEXPR constexpr +# endif + +namespace at { + +// 20.5.4, optional for object types +template class optional; + +// 20.5.5, optional for lvalue reference types +template class optional; + + +// workaround: std utility functions aren't constexpr yet +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept +{ + return static_cast(t); +} + +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept +{ + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept +{ + return static_cast::type&&>(t); +} + + +#if defined NDEBUG +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) +#endif + + +namespace detail_ +{ + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof +{ + template + constexpr static bool has_overload(...) { return false; } + + template ().operator&()) > + constexpr static bool has_overload(bool) { return true; } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) +{ + return &ref; +} + +template )> +T* static_addressof(T& ref) +{ + return std::addressof(ref); +} + + +// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { return v; } + +} // namespace detail + + +constexpr struct trivial_init_t{} trivial_init{}; + + +// 20.5.6, In-place construction +constexpr struct in_place_t{} in_place{}; + + +// 20.5.7, Disengaged state indicator +struct nullopt_t +{ + struct init{}; + constexpr explicit nullopt_t(init){} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { +public: + explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + + +template +union storage_t +{ + unsigned char dummy_; + T value_; + + constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~storage_t(){} +}; + + +template +union constexpr_storage_t +{ + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + + +template +struct optional_base +{ + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { if (init_) storage_.value_.T::~T(); } +}; + + +template +struct constexpr_optional_base +{ + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type> +>::type; + + + +template +class optional : private OptionalBase +{ + static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); + static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); + + + constexpr bool initialized() const noexcept { return OptionalBase::init_; } + typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } + constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } +# else + T& contained_val() & { return OptionalBase::storage_.value_; } + T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } +# endif +# else + constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } + T& contained_val() { return OptionalBase::storage_.value_; } +# endif + + void clear() noexcept { + if (initialized()) dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + +public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase() {}; + constexpr optional(nullopt_t) noexcept : OptionalBase() {}; + + optional(const optional& rhs) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept + { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) + noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) + -> typename std::enable_if + < + std::is_same::type, T>::value, + optional& + >::type + { + if (initialized()) { contained_val() = std::forward(v); } + else { initialize(std::forward(v)); } + return *this; + } + + + template + void emplace(Args&&... args) + { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) + { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) + { + if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } + else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } + else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { return initialized(); } + constexpr bool has_value() const noexcept { return initialized(); } + + constexpr T const* operator ->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { + assert (initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { + assert (initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +# else + + T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const { + return contained_val(); + } + + T& operator *() { + assert (initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + +# endif + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# else + + template + T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# endif + +# else + + template + constexpr T value_or(V&& v) const + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# endif + + // 20.6.3.6, modifiers + void reset() noexcept { clear(); } +}; + + +template +class optional +{ + static_assert( !std::is_same::value, "bad T" ); + static_assert( !std::is_same::value, "bad T" ); + T* ref; + +public: + + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + std::is_same::type, optional>::value, + optional& + >::type + { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + !std::is_same::type, optional>::value, + optional& + >::type + = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + + void swap(optional& rhs) noexcept + { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const + { + return *this ? **this : detail_::convert::type>(constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { ref = nullptr; } +}; + + +template +class optional +{ + static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); +}; + + +// 20.5.8, Relational operators +template constexpr bool operator==(const optional& x, const optional& y) +{ + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template constexpr bool operator!=(const optional& x, const optional& y) +{ + return !(x == y); +} + +template constexpr bool operator<(const optional& x, const optional& y) +{ + return (!y) ? false : (!x) ? true : *x < *y; +} + +template constexpr bool operator>(const optional& x, const optional& y) +{ + return (y < x); +} + +template constexpr bool operator<=(const optional& x, const optional& y) +{ + return !(y < x); +} + +template constexpr bool operator>=(const optional& x, const optional& y) +{ + return !(x < y); +} + + +// 20.5.9, Comparison with nullopt +template constexpr bool operator==(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator==(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + +template constexpr bool operator!=(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator!=(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<(const optional&, nullopt_t) noexcept +{ + return false; +} + +template constexpr bool operator<(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<=(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator<=(nullopt_t, const optional&) noexcept +{ + return true; +} + +template constexpr bool operator>(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator>(nullopt_t, const optional&) noexcept +{ + return false; +} + +template constexpr bool operator>=(const optional&, nullopt_t) noexcept +{ + return true; +} + +template constexpr bool operator>=(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + + + +// 20.5.10, Comparison with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? 
*x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) +{ + x.swap(y); +} + + +template +constexpr optional::type> make_optional(T&& v) +{ + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) +{ + return optional(v.get()); +} + + +} // namespace at + +namespace std +{ + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } + }; + + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } + }; +} + +# undef TR2_OPTIONAL_REQUIRES +# undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py new file mode 100644 index 0000000..1bc33e5 --- /dev/null +++ b/aten/src/ATen/preprocess_declarations.py @@ -0,0 +1,242 @@ +import re +from copy import deepcopy +from function_wrapper import TYPE_FORMAL_GENERIC +import common_with_cwrap + +type_map = { + 'floating_point': [ + 'Float', + 'Double', + 'Half', + ], + 'integral': [ + 'Byte', + 'Char', + 'Short', + 'Int', + 'Long' + ], +} + +all_types = type_map['floating_point'] + type_map['integral'] +type_map['all'] = all_types + +all_backends = ['CPU', 'CUDA', 'SparseCPU', 'SparseCUDA'] +default_backends = ['CPU', 'CUDA'] + +sparse_map = { + 'CPU': 'SparseCPU', + 'CUDA': 'SparseCUDA', +} + + +def process_types_and_backends(option): + # if specific pairs were not listed, then enumerate them + # based on the backend and type attributes + # if backend or type is not defined, it is assumed to be all of them + if 'backend_type_pairs' not in option: + backends = option.get('backends', default_backends) + if option.get('aten_sparse', False): + backends.extend([sparse_map[p] for p in backends if p in sparse_map]) + backends = set(backends) + + types = option.get('types', all_types) + + pairs = [[p, t] for p in backends for t in types] + else: + pairs = option['backend_type_pairs'] + + # expand type alias (integral, floating_point, all) + def expand(pair): + p, t = pair + assert(p in all_backends) + if t in type_map: + return [(p, tt) for tt in type_map[t]] + assert(t in all_types) + return [(p, t)] + pairs = set(p for pair in pairs for p in expand(pair)) + + # disable CUDA Half if there is a Sparse argument + for arg in option.get('arguments', []): + if arg['type'] == 'THSTensor*': + pairs.discard(('CUDA', 'Half')) + + # special case remove Half for cpu unless it is explicitly enabled, + if not option.get('cpu_half', False): + pairs.discard(('CPU', 'Half')) + + # sort the result for easy reading + option['backend_type_pairs'] = sorted([p for p in pairs]) + + +def exclude(declaration): + return 'only_register' in declaration or declaration.get('python_name') == 'ndimension' + + +def add_variants(option): + option.setdefault('variants', ['method']) + +# if we have 'output' arguments, generate a variant where +# we mark oututs as allocate = True, and where the method variant +# is disabled... 
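# --- Illustrative sketch, not part of the diff ---------------------------------
# A minimal example of what handle_outputs_taken_as_arguments (defined just below)
# produces for one declaration with an 'output' argument: an explicit-output
# '_out' variant that is a function only, plus an allocating variant whose output
# argument is marked allocate=True. The 'add'-style option is hypothetical; real
# options come from the parsed cwrap/YAML declarations, and this assumes the
# script and its sibling modules (function_wrapper, common_with_cwrap) are
# importable.
from preprocess_declarations import handle_outputs_taken_as_arguments

option = {
    'api_name': 'add',
    'mode': 'TH',
    'variants': ['method', 'function'],
    'arguments': [
        {'name': 'result', 'type': 'THTensor*', 'output': True},
        {'name': 'self',   'type': 'THTensor*'},
        {'name': 'other',  'type': 'THTensor*'},
    ],
}

for opt in handle_outputs_taken_as_arguments([option]):
    print(opt['api_name'], opt['variants'],
          [bool(arg.get('allocate')) for arg in opt['arguments']])
# Expected shape of the output:
#   add_out ['function'] [False, False, False]       <- explicit-output variant, no longer a method
#   add ['method', 'function'] [True, False, False]  <- allocating variant
# --------------------------------------------------------------------------------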
+ + +def handle_outputs_taken_as_arguments(options): + new_options = [] + + def is_nullable(arg): + return (arg['type'] in {'THIntegerTensor*', 'THTensor*'} and + arg.get('default', '') in {None, 'NULL', 'nullptr'}) + + def should_generate_out_variant(option): + if 'function' in option['variants'] and option['mode'] != 'native': + # don't generate _out variants for in-place functions + return re.search('(^__i|[^_]_$)', option['api_name']) is None + return False + + for option in options: + for arg in option['arguments']: + # mark arguments which can be null + if is_nullable(arg): + arg['is_nullable'] = True + + if any('output' in arg for arg in option['arguments']): + allocate_option = deepcopy(option) + # the allocating option needs to be marked + for arg in allocate_option['arguments']: + if 'output' in arg: + arg['allocate'] = True + + # the original option, which takes arguments for the results, + # is no longer a method, and has _out added to indicte it takes + # output arguments + if should_generate_out_variant(option): + if 'method' in option['variants']: + option['variants'].remove('method') + option['api_name'] += '_out' + new_options.append(option) + + new_options.append(allocate_option) + else: + new_options.append(option) + return new_options + + +def sanitize_return(option): + ret = option['return'] + m = re.match('argument (\d+(,\d+)*)', ret) + if m is not None: + arguments = [int(x) for x in m.group(1).split(',')] + option['return'] = {'kind': 'arguments', 'arguments': arguments} + elif ret == 'self': + option['return'] = {'kind': 'arguments', 'arguments': []} + for i, x in enumerate(option['arguments']): + if x['name'] == 'self': + option['return']['arguments'].append(i) + break + else: + option['return'] = {'kind': 'type', 'type': option['return']} + + +def set_mode(option): + option['mode'] = option.get('mode', 'TH') + +# To enable 0-dim support in TH operations +# we find all places where a single Scalar replaced with a Tensor +# as an argument is still a valid function +# we then mark the tensor variant with a key zero_dim_dispatch_when_scalar: name +# where 'name' is the name of the argument that should be a scalar +# during dispatch, if that argument is marked internally as holding a scalar +# then the method will dispatch to that function. 
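# --- Illustrative sketch, not part of the diff ---------------------------------
# How the signature matching in discover_zero_dim_tensor_operations (defined just
# below) pairs a scalar overload with its tensor overload: substituting 'Tensor &'
# for the 'real' (scalar) argument reproduces the tensor overload's signature, so
# the tensor overload is tagged with the name of the argument to watch at dispatch
# time. FORMAL is a hypothetical stand-in for TYPE_FORMAL_GENERIC, and both
# options are made-up examples.
FORMAL = {'THTensor*': 'Tensor &', 'real': 'Scalar'}

def sig(option, i=None, value=None):
    return '#'.join(value if j == i else FORMAL.get(a['type'], a['type'])
                    for j, a in enumerate(option['arguments']))

scalar_version = {'arguments': [{'name': 'self',  'type': 'THTensor*'},
                                {'name': 'other', 'type': 'real'}]}
tensor_version = {'arguments': [{'name': 'self',  'type': 'THTensor*'},
                                {'name': 'other', 'type': 'THTensor*'}]}

# Swapping argument 1 of the scalar overload for 'Tensor &' matches the tensor overload,
# so dispatch should go to the tensor overload when 'other' holds a 0-dim tensor/scalar.
assert sig(scalar_version, i=1, value='Tensor &') == sig(tensor_version)
tensor_version['zero_dim_dispatch_when_scalar'] = \
    tensor_version['arguments'][1]['name']   # -> 'other'
# --------------------------------------------------------------------------------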
+ + +def discover_zero_dim_tensor_operations(declaration): + def exclude(arg): + return arg.get('ignore_check') + + def signature(option, i=None, value=None): + elements = [TYPE_FORMAL_GENERIC.get(arg['type'], arg['type']) + if i is None or j != i else value + for j, arg in enumerate(option['arguments']) + if not exclude(arg)] + return '#'.join(elements) + signature_to_option = {signature(option): option + for option in declaration['options']} + + for option in declaration['options']: + for i, arg in enumerate(option['arguments']): + if arg['type'] == 'real': + signature_of_tensor_version = signature(option, i, 'Tensor &') + if signature_of_tensor_version in signature_to_option: + tensor_version = \ + signature_to_option[signature_of_tensor_version] + names = [arg['name'] for arg in tensor_version['arguments'] + if not exclude(arg)] + tensor_version['zero_dim_dispatch_when_scalar'] = names[i] + # print("FOUND "+str(i) ) + # print("Scalar Version ===== ") + # print(yaml.dump(option)) + # print("Tensor Version ===== ") + # print(yaml.dump(tensor_version)) + # print("SHARED "+names[i]) + + +def discover_sparse_tensor_operations(declaration): + def exclude(arg): + return arg.get('ignore_check') + + def signature(option, i=None, value=None): + elements = [TYPE_FORMAL_GENERIC.get(arg['type'], arg['type']) + if i is None or j != i else value + for j, arg in enumerate(option['arguments']) + if not exclude(arg)] + return '#'.join(elements) + + # Determine if any options have the 'aten_dense_sparse' flag + dense_sparse_options = [option + for option in declaration['options'] + if option.get('aten_dense_sparse', False)] + if len(dense_sparse_options) > 0: + signature_to_option = {signature(option): option + for option in declaration['options']} + + for option in declaration['options']: + for i, arg in enumerate(option['arguments']): + if (arg['type'] == 'THSTensor*' and + option.get('aten_dense_sparse', False)): + signature_of_tensor_version = signature( + option, i, 'Tensor &') + if signature_of_tensor_version in signature_to_option: + tensor_version = \ + signature_to_option[signature_of_tensor_version] + raw_args = len(tensor_version['arguments']) + names = [arg['name'] for arg in tensor_version['arguments'] + if not exclude(arg)] + filtered_args = len(names) + tensor_version['when_sparse_dispatch'] = names[i - + (raw_args - filtered_args)] + + +def run(declarations): + declarations = [d for d in declarations if not exclude(d)] + for declaration in declarations: + common_with_cwrap.set_declaration_defaults(declaration) + declaration['options'] = [deepcopy(o) for o in declaration['options']] + declaration['options'] = common_with_cwrap.filter_unique_options( + declaration['options'], + allow_kwarg=False, + type_to_signature=TYPE_FORMAL_GENERIC, + remove_self=True) + common_with_cwrap.sort_by_number_of_options(declaration) + discover_zero_dim_tensor_operations(declaration) + discover_sparse_tensor_operations(declaration) + + for option in declaration['options']: + set_mode(option) + if option['mode'] != 'native': + sanitize_return(option) + process_types_and_backends(option) + add_variants(option) + declaration['options'] = handle_outputs_taken_as_arguments( + declaration['options']) + return declarations diff --git a/aten/src/ATen/stub/CombinedStub.cpp b/aten/src/ATen/stub/CombinedStub.cpp new file mode 100644 index 0000000..e69de29 diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h new file mode 100644 index 0000000..2c510a4 --- /dev/null +++ 
b/aten/src/ATen/templates/Functions.h @@ -0,0 +1,36 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Tensor.h" +#include "ATen/Storage.h" +#include "ATen/Generator.h" +#include "ATen/Deprecated.h" +#include "ATen/NativeFunctions.h" +#include "ATen/DeviceGuard.h" +#include "ATen/TensorOptions.h" +#include "THNN/Reduction.h" + +namespace at { + +using native::from_blob; +using native::tensor; + +${function_declarations} + +static inline Type & infer_type(const Tensor & t) { + AT_CHECK(t.defined(), "undefined Tensor"); + return t.type(); +} +static inline Type & infer_type(const TensorList & tl) { + AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].type(); +} +// function definitions are all static inline because +// they are one-line statically dispatched functions that +// invoke the actual dynamic dispatch on the correct argument +${function_definitions} + +} diff --git a/aten/src/ATen/templates/GeneratorDerived.h b/aten/src/ATen/templates/GeneratorDerived.h new file mode 100644 index 0000000..9fde183 --- /dev/null +++ b/aten/src/ATen/templates/GeneratorDerived.h @@ -0,0 +1,31 @@ +#pragma once + +// ${generated_comment} + +#include <$header> + +#include "ATen/Generator.h" + +namespace at { + +class Context; +struct ${name}Generator : public Generator { + ${name}Generator(Context * context); + virtual ~${name}Generator(); + + virtual ${name}Generator& copy(const Generator& from) override; + virtual ${name}Generator& free() override; + + virtual uint64_t seed() override; + virtual uint64_t initialSeed() override; + virtual ${name}Generator& manualSeed(uint64_t seed) override; + virtual ${name}Generator& manualSeedAll(uint64_t seed) override; + virtual void * unsafeGetTH() override; + +//TODO(zach): figure out friends later +public: + Context * context; + ${th_generator} +}; + +} diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h new file mode 100644 index 0000000..2c84f21 --- /dev/null +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -0,0 +1,66 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at { +struct Generator; +class Scalar; +struct Tensor; +struct Type; +} // namespace at + +namespace at { +namespace native { + +inline Tensor from_blob( + void* data, + IntList sizes, + const std::function& deleter, + const TensorOptions& options = {}) { + return options.type().tensorFromBlob(data, sizes, deleter); +} + +inline Tensor from_blob( + void* data, + IntList sizes, + const TensorOptions& options = {}) { + return native::from_blob(data, sizes, [](void*) {}, options); +} + +// These functions are defined in native/TensorFactories.cpp. 
+#define TENSOR(T, S, _1) \ + Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return native::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return native::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return native::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return native::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return native::tensor(ArrayRef(value)); \ + } +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) +#undef TENSOR + +${native_function_declarations} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCUDA.cpp b/aten/src/ATen/templates/RegisterCUDA.cpp new file mode 100644 index 0000000..40c00c1 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCUDA.cpp @@ -0,0 +1,17 @@ +#include + +// ${generated_comment} + +#include +#include +#include + +${cuda_type_headers} + +namespace at { + +void register_cuda_types(Context * context) { + ${cuda_type_registrations} +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCUDA.h b/aten/src/ATen/templates/RegisterCUDA.h new file mode 100644 index 0000000..3fa97c6 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCUDA.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cuda_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp new file mode 100644 index 0000000..42a6ec9 --- /dev/null +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -0,0 +1,84 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +#include "ATen/${Type}.h" + +// ${generated_comment} + +#include "ATen/${Generator}.h" +#include "ATen/${DenseTensor}.h" +#include "ATen/${DenseBackend}LongTensor.h" +#include "ATen/Allocator.h" +#include "ATen/Half.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/THLongStorageView.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/Utils.h" +#include "ATen/DeviceGuard.h" +#include "ATen/optional.h" + +#include +#include +#include +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Type}::${Type}(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} +ScalarType ${Type}::scalarType() const { + return ScalarType::${ScalarName}; +} +Backend ${Type}::backend() const { + return Backend::${Backend}; +} +bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_distributed() const { return false; } + +std::unique_ptr ${Type}::storage() const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storage(size_t size) const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { + AT_ERROR("storage not supported on sparse"); +} +Tensor 
${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not supported on sparse"); +} +std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not supported on sparse"); +} +std::unique_ptr ${Type}::generator() const { + return std::unique_ptr(new ${Generator}(context)); +} + +const char * ${Type}::toString() const { + return ${Type}::typeString(); +} +TypeID ${Type}::ID() const { + return ${TypeID}; +} + +size_t ${Type}::elementSizeInBytes() const { + return sizeof(${ScalarType}); +} + +const char * ${Type}::typeString() { + return "${Type}"; +} + +${type_derived_method_definitions} + +} diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp new file mode 100644 index 0000000..83e10b9 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -0,0 +1,146 @@ +#include "ATen/${Storage}.h" + +// ${generated_comment} + +#include "ATen/Half.h" +#include "ATen/Allocator.h" + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Storage}::${Storage}(Context* context): + storage(${THStorage}_new(${state})), context(context) {} + +${Storage}::${Storage}(Context* context, THStorage* storage): + storage(storage), context(context) {} + +${Storage}::${Storage}(Context* context, size_t storage_size) + : storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} + +${Storage}::${Storage}(Context* context, size_t size, Allocator* allocator) + : storage(nullptr), + context(context) { + storage = ${THStorage}_newWithAllocator(${state,} size, allocator); + ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); +} + +// TODO: Take in Device as an input to the std::function constructor + +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + +${Storage}::${Storage}(Context* context, + void * data, size_t size, const std::function & deleter) + : storage(${THStorage}_newWithDataAndAllocator(${state,} + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), size, + /* allocator */ nullptr + )), + context(context) { + ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); +} + +${Storage}::~${Storage}() { + ${THStorage}_free(${state,} storage); +} + +size_t ${Storage}::elementSize() const { + return sizeof(${ScalarType}); +} + +size_t ${Storage}::size() const { + return storage->size; +} + +void* ${Storage}::data() { + return storage->data_ptr.get(); +} + +const void* ${Storage}::data() const { + return storage->data_ptr.get(); +} + +auto ${Storage}::retain() -> ${Storage}& { + ${THStorage}_retain(${state,} storage); + return *this; +} + +auto ${Storage}::free() -> ${Storage}& { + ${THStorage}_free(${state,} storage); + return *this; +} + +void* ${Storage}::unsafeGetTH(bool retain) const { + if (retain) { + ${THStorage}_retain(${state,} storage); + } + return storage; +} + +auto ${Storage}::resize(int64_t new_size) -> ${Storage}& { + ${THStorage}_resize(${state,} storage, new_size); + return *this; +} + +auto ${Storage}::fill(Scalar value) -> ${Storage}& { + ${THStorage}_fill(${state,} storage, ${to_th_type}(value.to${ScalarName}())); + return *this; +} + +auto ${Storage}::set(size_t ind, Scalar value) -> ${Storage}& { + ${THStorage}_set(${state,} storage, ind, 
${to_th_type}(value.to${ScalarName}())); + return *this; +} + +auto ${Storage}::fast_set(size_t ind, Scalar value) -> ${Storage}& { + throw std::runtime_error("unsupported operation 'fast_set'"); +} + +auto ${Storage}::get(size_t ind) -> Scalar { + // static cast to fix long -> int64_t issues + return static_cast<${ScalarType}>(${to_at_type}(${THStorage}_get(${state,} storage, ind))); +} + +auto ${Storage}::fast_get(size_t ind) -> Scalar { + if(${isCUDA}) + throw std::runtime_error("unsupported operation 'fast_get'"); + return static_cast<${ScalarType}>(${to_at_type}(storage->unsafe_data<${THScalarType}>()[ind])); +} + +void ${Storage}::set_flag(char flag) { + ${THStorage}_setFlag(${state,} storage, flag); +} + +void ${Storage}::clear_flag(char flag) { + ${THStorage}_clearFlag(${state,} storage, flag); +} + +int ${Storage}::getDevice() const { + return storage->data_ptr.device().index(); +} + +Type& ${Storage}::type() const { + return context->getType(Backend::${Backend},ScalarType::${ScalarName}); +} + +const char * ${Storage}::toString() const { + return "${Storage}"; +} + +const char * ${Storage}::typeString() { + return "${Type}"; +} + +} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h new file mode 100644 index 0000000..d97d397 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.h @@ -0,0 +1,57 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Storage.h" +#include "ATen/Context.h" + +#include + +namespace at { + +struct Allocator; + +struct ${Storage} final : public Storage { +public: + explicit ${Storage}(Context* context); + ${Storage}(Context* context, THStorage *wrapped); + ${Storage}(Context* context, size_t size); + ${Storage}(Context* context, size_t size, Allocator* allocator); + ${Storage}(Context* context, + void * data, size_t size, const std::function & deleter); + virtual ~${Storage}(); + + virtual size_t elementSize() const override; + virtual size_t size() const override; + virtual void* data() override; + virtual const void* data() const override; + virtual ${Storage}& retain() override; + virtual ${Storage}& free() override; + virtual void * unsafeGetTH(bool retain) const override; + + virtual ${Storage}& resize(int64_t new_size) override; + virtual ${Storage}& fill(Scalar value) override; + virtual ${Storage}& set(size_t ind, Scalar value) override; + virtual ${Storage}& fast_set(size_t ind, Scalar value) override; + virtual Scalar get(size_t ind) override; + virtual Scalar fast_get(size_t ind) override; + + virtual void set_flag(char flag) override; + virtual void clear_flag(char flag) override; + + virtual Type& type() const override; + virtual int getDevice() const override; + virtual const char * toString() const override; + + static const char * typeString(); + + +protected: + friend struct ${Type}; + THStorage *storage; + Context* context; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h new file mode 100644 index 0000000..31e952e --- /dev/null +++ b/aten/src/ATen/templates/Tensor.h @@ -0,0 +1,250 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Generator.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Storage.h" +#include "ATen/TensorAccessor.h" +#include "ATen/TensorBase.h" +#include "ATen/TensorImpl.h" +#include "ATen/Utils.h" +#include "ATen/Device.h" +#include "ATen/Layout.h" +#include "ATen/optional.h" + +namespace at { +struct Type; +struct 
Tensor; +struct TensorOptions; +namespace detail { +void set_data(Tensor& tensor, Tensor new_data); +} // namespace detail +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct Tensor : public detail::TensorBase { + using TensorBase = detail::TensorBase; + Tensor() : TensorBase() {} + Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} + Tensor(const TensorBase & rhs) : TensorBase(rhs) {} + Tensor(const Tensor & rhs) = default; + Tensor(Tensor && rhs) noexcept = default; + + // reimplemented from TensorBase so the return type is Tensor rather than TensorBase + Tensor & operator=(Tensor && rhs) & { + rhs.swap(*this); + return *this; + } + Tensor & operator=(Tensor const & rhs) & { + //Tensor ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl + Tensor(rhs).swap(*this); + return *this; + } + + inline Tensor & operator=(Tensor const & rhs) &&; + Tensor & operator=(Scalar v) &&; + const char * toString() const { + return pImpl->toString(); + } + IntList sizes() const { + return pImpl->sizes(); + } + IntList strides() const { + return pImpl->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return pImpl->type(); + } + std::unique_ptr storage() const { + return pImpl->storage(); + } + inline Tensor toType(const Type & t, bool non_blocking=false) const; + inline Tensor & copy_(const Tensor & src, bool non_blocking=false); + inline Tensor toType(ScalarType t) const; + inline Tensor toBackend(Backend b) const; + + /// New-style `to()` methods. + /// NB: These methods are defined in TensorOptions.h. + Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; + Tensor to(ScalarType dtype, bool non_blocking = false) const; + Tensor to(Device device, bool non_blocking = false) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. 
+ TensorOptions options() const; + + template + T * data() const; + + void * unsafeGetTH(bool retain) const { + return pImpl->unsafeGetTH(retain); + } + + // non-retaining + TensorImpl * unsafeGetTensorImpl() const { + return pImpl; + } + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES(TO_C_TYPE) + #undef TO_C_TYPE + + template + TensorAccessor accessor() const { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + pImpl->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return pImpl->requires_grad(); + } + + Tensor& grad() { + return pImpl->grad(); + } + const Tensor& grad() const { + return pImpl->grad(); + } + + Tensor detach() const { + return pImpl->detach(); + } + void detach_() { + pImpl->detach_(); + } + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + friend void detail::set_data(Tensor& tensor, Tensor new_data); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. + + //example + //Tensor * add(Tensor & b); + ${tensor_method_declarations} + + template + auto m(F func, Args&&... 
params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; +}; + +struct WeakTensor : public detail::WeakTensorBase { + using WeakTensorBase = detail::WeakTensorBase; + WeakTensor() : WeakTensorBase() {} + WeakTensor(TensorImpl * self, bool retain) : WeakTensorBase(self, retain) {} + WeakTensor(const WeakTensor & rhs) = default; + WeakTensor(WeakTensor && rhs) noexcept = default; + WeakTensor(const Tensor& t) : WeakTensorBase(t.pImpl, true) {} + + // reimplemented from TensorBase so the return type is WeakTensor rather than TensorBase + WeakTensor & operator=(WeakTensor && rhs) & { + rhs.swap(*this); + return *this; + } + WeakTensor & operator=(WeakTensor const & rhs) & { + //Tensor ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl + WeakTensor(rhs).swap(*this); + return *this; + } + + WeakTensor & operator=(const Tensor& t) { + WeakTensor(t.pImpl, true).swap(*this); + return *this; + } + + // non-retaining + TensorImpl * unsafeGetTensorImpl() const { + return pImpl; + } + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return pImpl->weak_lock() ? Tensor(pImpl, false) : Tensor(); + } +}; + +namespace detail { +inline void set_data(Tensor& tensor, Tensor new_data) { + tensor.pImpl->set_data(new_data); +} +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp new file mode 100644 index 0000000..92ffeb3 --- /dev/null +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -0,0 +1,15 @@ +// included as 'TensorDenseOrSparse' in TensorDerived.cpp + +IntList ${Tensor}::strides() const { + return IntList(tensor->stride,dim()); +} +Scalar ${Tensor}::localScalar() { + int64_t numel = ${THTensor}_nElement(${state,}tensor); + AT_CHECK(numel == 1,"a Tensor with ", numel, " elements cannot be converted to Scalar"); + return Scalar(${to_at_type}(${THStorage}_get(${state,}tensor->storage, tensor->storageOffset))); +} +std::unique_ptr ${Tensor}::storage() { + auto storage = ${THTensor}_storage(${state,}tensor); + ${THStorage}_retain(${state,}storage); + return std::unique_ptr(new ${Storage}(&type().get_context(), storage)); +} diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp new file mode 100644 index 0000000..e15eb5f --- /dev/null +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -0,0 +1,59 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +// ${generated_comment} + +#include "ATen/Config.h" +#include "ATen/${Tensor}.h" +#include "ATen/${Storage}.h" +#include "ATen/Scalar.h" +#include "ATen/Half.h" + +$extra_cuda_headers + +namespace at { + +${Tensor}::${Tensor}(Context* context) +: ${Tensor}(context,${THTensor}_new(${state})) {} + +${Tensor}::${Tensor}(Context* context, ${THTensor} * tensor) +: TensorImpl(&context->getType(Backend::${Backend},ScalarType::${ScalarName})), + tensor(tensor), + context(context) {} +${Tensor}::~${Tensor}() { + ${THTensor}_free(${state,} tensor); +} + +const char * ${Tensor}::toString() const { + return "${Tensor}"; +} + +IntList ${Tensor}::sizes() const { + return IntList(tensor->size,dim()); +} + +int64_t ${Tensor}::dim() const { + if(isScalar()) + return 0; + return 
tensor->dim(); +} + +const char * ${Tensor}::typeString() { + return "${Type}"; +} +void * ${Tensor}::unsafeGetTH(bool retain) { + if (retain) + ${THTensor}_retain(${state,} tensor); + return tensor; +} + +void ${Tensor}::release_resources() { + ${THTensor}_free(${state,} tensor); + tensor = nullptr; +} + +${TensorDenseOrSparse} + +} diff --git a/aten/src/ATen/templates/TensorDerived.h b/aten/src/ATen/templates/TensorDerived.h new file mode 100644 index 0000000..892d6bc --- /dev/null +++ b/aten/src/ATen/templates/TensorDerived.h @@ -0,0 +1,37 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Tensor.h" +#include "ATen/TensorImpl.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" + +namespace at { + +struct ${Tensor} final : public TensorImpl { +public: + explicit ${Tensor}(Context* context); + ${Tensor}(Context* context, ${THTensor} * tensor); + virtual ~${Tensor}(); + virtual const char * toString() const override; + virtual IntList sizes() const override; + virtual IntList strides() const override; + virtual int64_t dim() const override; + virtual Scalar localScalar() override; + virtual void * unsafeGetTH(bool retain) override; + virtual std::unique_ptr storage() override; + virtual void release_resources() override; + static const char * typeString(); + +//TODO(zach): sort of friend permissions later so this +// can be protected +public: + ${THTensor} * tensor; + Context* context; + friend struct ${Type}; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h new file mode 100644 index 0000000..846f5c5 --- /dev/null +++ b/aten/src/ATen/templates/TensorMethods.h @@ -0,0 +1,62 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Tensor.h" +#include "ATen/Scalar.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Type.h" + +namespace at { + +inline Tensor & Tensor::operator=(Tensor const & rhs) && { + return copy_(rhs); +} + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + + +// all static inline to allow for inlining of the non-dynamic part of dispatch +${tensor_method_definitions} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return pImpl->localScalar().to##name (); } + +AT_FORALL_SCALAR_TYPES(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp new file mode 100644 index 0000000..ea75f1c --- /dev/null +++ b/aten/src/ATen/templates/Type.cpp @@ -0,0 +1,108 @@ +#include "ATen/Type.h" + +// ${generated_comment} + +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Scalar.h" +#include 
"ATen/SparseTensorRef.h" +#include "ATen/Storage.h" +#include "ATen/Tensor.h" +#include "ATen/TensorOptions.h" +#include "ATen/UndefinedType.h" +#include "ATen/DeviceGuard.h" + +#include + +#include +${cpu_type_headers} + +namespace at { + +void Type::registerCPU(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); +} + +Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { + Tensor b_src; + std::tie(b_src) = expand_inplace(self, src, "copy"); + return s_copy_(self, b_src, non_blocking); +} + +Tensor Type::copy(const Tensor & src, bool non_blocking) const { + // TODO(psag): have a DeviceGuard here + AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); + if (is_sparse()) { + auto indices = src._indices(); + auto values = src._values(); + auto & this_dense = toBackend(is_cuda() ? Backend::CUDA : Backend::CPU); + auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long); + auto indices_copy = this_dense_idx.copy(indices, non_blocking); + auto values_copy = this_dense.copy(values, non_blocking); + return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes()); + } else { + Tensor r = this->tensor(src.sizes()); + r.copy_(src, non_blocking); + return r; + } +} + +Type & Type::toBackend(Backend b) const { + return context->getType(b,scalarType()); +} +Type & Type::toScalarType(ScalarType s) const { + return context->getType(backend(),s); +} +static std::vector defaultStrides(IntList sizes) { + std::vector strides(sizes.size()); + int64_t stride = 1; + for(size_t i = sizes.size(); i > 0; --i) { + strides[i-1] = stride; + stride *= sizes[i-1]; + } + return strides; +} +static int64_t computeStorageSize(IntList sizes, IntList strides) { + // size of the underlying storage is 1 bigger than the offset + // of the last element according to stride + int64_t size = 1; + for(size_t i = 0; i < sizes.size(); i++) { + if(sizes[i] == 0) { + return 0; + } + size += strides[i]*(sizes[i]-1); + } + return size; +} +Tensor Type::tensorFromBlob(void * data, IntList sizes, const std::function & deleter) const { + return tensorFromBlob(data, sizes, defaultStrides(sizes), deleter); +} +Tensor Type::tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter) const { + auto storage = storageFromBlob(data, computeStorageSize(sizes, strides), deleter); + return tensor(*storage, 0, sizes, strides); +} +Tensor Type::tensorWithAllocator(IntList sizes, Allocator* allocator) const { + return tensorWithAllocator(sizes, defaultStrides(sizes), std::move(allocator)); +} +Tensor Type::tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const { + auto storage = storageWithAllocator(computeStorageSize(sizes, strides), std::move(allocator)); + return tensor(*storage, 0, sizes, strides); +} +Tensor Type::scalarTensor(Scalar s) const { + if(s.isBackedByTensor()) + return Tensor(s.t).toType(*this); + return tensor({}).fill_(s); +} + +bool Type::operator==(const Type& other) const { + return this == &other; +} +bool Type::operator!=(const Type& other) const { + return this != &other; +} + +${type_method_definitions} + +} diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h new file mode 100644 index 0000000..459e363 --- /dev/null +++ b/aten/src/ATen/templates/Type.h @@ -0,0 +1,122 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Allocator.h" +#include 
"ATen/ArrayRef.h" +#include "ATen/ATenGeneral.h" +#include "ATen/Generator.h" +#include "ATen/Half.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Tensor.h" +#include "ATen/Deprecated.h" +#include "ATen/Layout.h" +#include "THNN/Reduction.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + ${type_ids} + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(Context* context, bool is_variable, bool is_undefined) + : context(context), is_variable_(is_variable), is_undefined_(is_undefined) {} + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const noexcept { return is_undefined_; } + static void registerCPU(Context * context); + virtual std::unique_ptr storage() const = 0; + virtual std::unique_ptr storage(size_t size) const = 0; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const = 0; + virtual const char * toString() const = 0; + virtual size_t elementSizeInBytes() const = 0; + virtual Type & toBackend(Backend b) const; + virtual Type & toScalarType(ScalarType s) const; + Type & toSparse() const { + return this->toBackend(at::toSparse(this->backend())); + } + Type & toDense() const { + return this->toBackend(at::toDense(this->backend())); + } + Context& get_context() const { return *context; } + + // contingious IDs for all types in the system + // for external dispatch + virtual TypeID ID() const = 0; + + Tensor copy(const Tensor & src, bool non_blocking=false) const; + Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const; + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; + + Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const; + Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const; + Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const; + Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const; + Tensor scalarTensor(Scalar s) const; + + bool operator==(const Type& other) const; + bool operator!=(const Type& other) const; + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + ${type_method_declarations} +protected: + Context* context; + bool is_variable_; + bool is_undefined_; + +}; + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const 
noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().backend(), type().is_cuda() ? get_device() : -1); +} + +} // namespace at diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp new file mode 100644 index 0000000..6699070 --- /dev/null +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -0,0 +1,98 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +#include "ATen/${Type}.h" + +// ${generated_comment} + +$storage_tensor_headers +#include "ATen/${Generator}.h" +#include "ATen/${DenseTensor}.h" +#include "ATen/${DenseBackend}LongTensor.h" +#include "ATen/Allocator.h" +#include "ATen/Half.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/THLongStorageView.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/Utils.h" +#include "ATen/DeviceGuard.h" +#include "ATen/optional.h" + +#include +#include +#include +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Type}::${Type}(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} +ScalarType ${Type}::scalarType() const { + return ScalarType::${ScalarName}; +} +Backend ${Type}::backend() const { + return Backend::${Backend}; +} +bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_distributed() const { return false; } + +std::unique_ptr ${Type}::storage() const { + return std::unique_ptr(new ${Storage}(context)); +} +std::unique_ptr ${Type}::storage(size_t size) const { + return std::unique_ptr(new ${Storage}(context,size)); +} +std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + return std::unique_ptr( + new ${Storage}(context,data,size,deleter)); +} +std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { + return std::unique_ptr( + new ${Storage}(context, size, allocator)); +} +Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { + if (retain) + ${THTensor}_retain(${state,} (${THTensor}*) th_pointer); + return Tensor(new ${Tensor}(context,(${THTensor}*)(th_pointer)), false); +} +std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { + if (retain) + ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); + return std::unique_ptr(new ${Storage}(context, (${THStorage}*) th_pointer)); +} +std::unique_ptr ${Type}::generator() const { + return std::unique_ptr(new ${Generator}(context)); +} + +const char * ${Type}::toString() const { + return ${Type}::typeString(); +} +TypeID ${Type}::ID() const { + return ${TypeID}; +} + +size_t ${Type}::elementSizeInBytes() const { + return sizeof(${ScalarType}); +} + +const char * ${Type}::typeString() { + return "${Type}"; +} + +/* example +Tensor * ${Type}::add(Tensor & a, Tensor & b) { + std::cout << "add ${Tensor}\n"; + return &a; +} +*/ + +${type_derived_method_definitions} + +} diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h new file mode 100644 index 0000000..92d3cf2 --- /dev/null +++ b/aten/src/ATen/templates/TypeDerived.h @@ -0,0 +1,45 @@ +#pragma once + +// ${generated_comment} + +#include 
"ATen/Type.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" +#include "ATen/CheckGenerator.h" + +#ifdef _MSC_VER +#ifdef Type +#undef Type +#endif +#endif + +namespace at { + +struct ${Type} final : public Type { + explicit ${Type}(Context* context); + virtual ScalarType scalarType() const override; + virtual Backend backend() const override; + virtual bool is_cuda() const override; + virtual bool is_sparse() const override; + virtual bool is_distributed() const override; + virtual std::unique_ptr storage() const override; + virtual std::unique_ptr storage(size_t size) const override; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter) const override; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const override; + virtual std::unique_ptr generator() const override; + virtual const char * toString() const override; + virtual size_t elementSizeInBytes() const override; + virtual TypeID ID() const override; + static const char * typeString(); + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const override; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) override; + + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; + ${type_derived_method_declarations} +}; + +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt new file mode 100644 index 0000000..25d84a3 --- /dev/null +++ b/aten/src/ATen/test/CMakeLists.txt @@ -0,0 +1,36 @@ +IF (MSVC) + IF (MSVC_VERSION LESS 1911) + return() + ENDIF() +ENDIF(MSVC) + +list(APPEND ATen_CPU_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/scalar_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/apply_utils_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/basic.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/atest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/half_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/broadcast_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/wrapdim_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dlconvertor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/native_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/scalar_tensor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_parallel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/undefined_tensor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/verify_api_visibility.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tbb_init_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/weakref_test.cpp) + +list(APPEND ATen_CUDA_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/integer_divider_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rng_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp) +if (CUDNN_FOUND) + list(APPEND ATen_CUDA_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) +endif() + +# ---[ Send the lists to the parent scope. +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp new file mode 100644 index 0000000..986f599 --- /dev/null +++ b/aten/src/ATen/test/apply_test.cpp @@ -0,0 +1,121 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "cuda.h" +#include "cuda_runtime.h" + +#include "ATen/cuda/detail/TensorInfo.cuh" + +/* +Tests related to tensor indexing and applying operations. 
+*/ +#ifndef _WIN32 + +TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { + int sizes[] = {4, 4}; + int strides[] = {4, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (4 * 4)); +} + +TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { + int sizes[] = {6, 3, 7}; + int strides[] = {3 * 7, 7, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (6 * 3 * 7)); +} + +TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { + int sizes[] = {4, 3, 2}; + int strides[] = {3 * 3, 3, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (4 * 3)); + REQUIRE(ti.sizes[1] == 2); +} + +TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { + int sizes[] = {3, 2}; + int strides[] = {2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (3 * 2)); + REQUIRE(ti.strides[0] == 2); +} + +TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (3 * 6)); + REQUIRE(ti.strides[0] == 22); + REQUIRE(ti.sizes[1] == (5 * 2)); + REQUIRE(ti.strides[1] == 2); +} + +TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { + int sizes[] = {1, 10, 1, 5, 4}; + int strides[] = {4, 0, 16, 0, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (10 * 5)); + REQUIRE(ti.strides[0] == 0); + REQUIRE(ti.sizes[1] == 4); + REQUIRE(ti.strides[1] == 1); +} + +TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + REQUIRE(ti.collapseDims() == 0); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == 1); + REQUIRE(ti.strides[0] == 1); +} + +TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + REQUIRE(ti.collapseDims(1) == 1); + REQUIRE(ti.dims == 3); + REQUIRE(ti.sizes[0] == 3); + REQUIRE(ti.strides[0] == (6 * 22)); + REQUIRE(ti.sizes[1] == 6); + REQUIRE(ti.strides[1] == 22); + REQUIRE(ti.sizes[2] == (5 * 2)); + REQUIRE(ti.strides[2] == 2); +} + +TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + REQUIRE(ti.collapseDims(2) == 1); + REQUIRE(ti.dims == 3); + REQUIRE(ti.sizes[0] == (3 * 6)); + REQUIRE(ti.strides[0] == 22); + REQUIRE(ti.sizes[1] == 5); + REQUIRE(ti.strides[1] == 4); + REQUIRE(ti.sizes[2] == 2); + REQUIRE(ti.strides[2] == 2); +} + +TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, 
strides}; + REQUIRE_THROWS(ti.collapseDims(5)); +} + +#endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp new file mode 100644 index 0000000..24359a0 --- /dev/null +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -0,0 +1,139 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "test_assert.h" +#include "test_seed.h" + +#include +using namespace std; +using namespace at; + +void fill_tensor(int64_t scalar, Tensor& t_) { + auto t = t_.view(-1); + for (int64_t i = 0; i < t.numel(); i++) { + t[i] = (i + 1) * scalar; + } +} + +// This test exercises all sequential applyX functions. Given a shape and two +// transpose dimensions we create 5 tensors (a0, ..., a4) of the given shape and +// transpose the dimension a with b for each tensor. Then we call the applyX +// function on each floating type. a4 is allocated in doubles only, whereas a0, +// ..., a3 are allocated in the given type. For each applyX function we once +// write the same type as we read (using a0, ..., aX-1) and we once write to +// double (using a4 as a target). We also exercise on a zero_dim and empty +// tensor. +void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { + auto zero_dim = at::empty({}, type); + zero_dim.fill_(2); + zero_dim.exp_(); + AT_DISPATCH_FLOATING_TYPES(zero_dim.type(), "test0", [&] { + ASSERT(zero_dim.data()[0] == std::exp(2)); + }); + + auto empty_t = at::empty({0}, type); + empty_t.fill_(3); + empty_t.exp_(); + + auto a0 = type.tensor(); + auto a1 = type.tensor(); + auto a2 = type.tensor(); + auto a3 = type.tensor(); + auto a4 = CPU(kDouble).tensor(); + + std::vector tensors({a0, a1, a2, a3, a4}); + for (size_t i = 0; i < tensors.size(); i++) { + tensors[i].resize_(shape); + fill_tensor(i + 1, tensors[i]); + if (a >= 0 && b >= 0) { + tensors[i].transpose_(a, b); + } + } + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test1", [&] { + CPU_tensor_apply2( + a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; }); + CPU_tensor_apply2( + a4, a1, [](double& y, scalar_t x) { y = (double)(x * x); }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test2", [&] { + CPU_tensor_apply3( + a0, a1, a2, [](scalar_t& y, const scalar_t& x, const scalar_t& z) { + y = x * x + z; + }); + CPU_tensor_apply3( + a4, a1, a2, [](double& y, const scalar_t& x, const scalar_t& z) { + y = (double)(x * x + z); + }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + target = target + a2.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test3", [&] { + CPU_tensor_apply4( + a0, + a1, + a2, + a3, + [](scalar_t& y, + const scalar_t& x, + const scalar_t& z, + const scalar_t& a) { y = x * x + z * a; }); + CPU_tensor_apply4( + a4, + a1, + a2, + a3, + [](double& y, const scalar_t& x, const scalar_t& z, const scalar_t& a) { + y = (double)(x * x + z * a); + }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + target = target + a2.data()[i] * a3.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); +} + +TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {2, 1}, -1, -1); +} + +TEST_CASE("apply 
utils test 2-dim small", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {2, 1}); +} + +TEST_CASE("apply utils test 2-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {20, 10}); +} + +TEST_CASE("apply utils test 3-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 4, 2}); +} + +TEST_CASE("apply utils test 3-dim medium", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 40, 2}); +} + +TEST_CASE("apply utils test 10-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); +} diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp new file mode 100644 index 0000000..af25179 --- /dev/null +++ b/aten/src/ATen/test/atest.cpp @@ -0,0 +1,113 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +#include +using namespace std; +using namespace at; + +void trace() { + Tensor foo = rand({12,12}); + + // ASSERT foo is 2-dimensional and holds floats. + auto foo_a = foo.accessor(); + float trace = 0; + + for(int i = 0; i < foo_a.size(0); i++) { + trace += foo_a[i][i]; + } + + REQUIRE(Scalar(foo.trace()).toFloat() == Approx(trace)); +} + +TEST_CASE( "atest", "[]" ) { + + manual_seed(123, at::Backend::CPU); + manual_seed(123, at::Backend::CUDA); + + auto foo = rand({12,6}); + REQUIRE(foo.data() == foo.toFloatData()); + + REQUIRE(foo.size(0) == 12); + REQUIRE(foo.size(1) == 6); + + foo = foo+foo*3; + foo -= 4; + + { + Tensor no; + REQUIRE_THROWS(add_out(no,foo,foo)); + } + Scalar a = 4; + + float b = a.to(); + REQUIRE(b == 4); + + foo = (foo*foo) == (foo.pow(3)); + foo = 2 + (foo+1); + //foo = foo[3]; + auto foo_v = foo.accessor(); + + for(int i = 0; i < foo_v.size(0); i++) { + for(int j = 0; j < foo_v.size(1); j++) { + foo_v[i][j]++; + } + } + + REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + + trace(); + + float data[] = { 1, 2, 3, + 4, 5, 6}; + + auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); + auto f_a = f.accessor(); + + REQUIRE(f_a[0][0][0] == 1.0); + REQUIRE(f_a[0][1][1] == 5.0); + + REQUIRE(f.strides()[0] == 6); + REQUIRE(f.strides()[1] == 3); + REQUIRE(f.strides()[2] == 1); + REQUIRE(f.sizes()[0] == 1); + REQUIRE(f.sizes()[1] == 2); + REQUIRE(f.sizes()[2] == 3); + + REQUIRE_THROWS(f.resize_({3,4,5})); + { + int isgone = 0; + { + auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { + isgone++; + }); + } + REQUIRE(isgone == 1); + } + { + int isgone = 0; + Tensor a_view; + { + auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { + isgone++; + }); + a_view = f2.view({3,2,1}); + } + REQUIRE(isgone == 0); + a_view.reset(); + REQUIRE(isgone == 1); + } + + if(at::hasCUDA()) { + int isgone = 0; + { + auto base = CUDA(kFloat).tensor({1,2,3}); + auto f2 = CUDA(kFloat).tensorFromBlob(base.data_ptr(), {1,2,3}, [&](void*) { + isgone++; + }); + } + REQUIRE(isgone==1); + } +} diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp new file mode 100644 index 0000000..6b46c8c --- /dev/null +++ b/aten/src/ATen/test/basic.cpp @@ -0,0 +1,287 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "THNN/Reduction.h" + +// for TH compat test only... 
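+// The forward declarations below let this test build a raw THFloatTensor and
+// hand it to ATen without pulling in the TH headers, e.g.
+//   THFloatTensor* t = THFloatTensor_newWithSize2d(4, 4);
+//   Tensor tt = CPU(kFloat).unsafeTensorFromTH(t, /*retain=*/false);
+// (see the "tensor from TH" section further down)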
+struct THFloatTensor; +extern "C" THFloatTensor * THFloatTensor_newWithSize2d(size_t a, size_t b); +extern "C" void THFloatTensor_fill(THFloatTensor *, float v); + +#include +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +using Catch::Matchers::StartsWith; + +static void test(Type & type) { + SECTION( "resize" ) { + auto a = type.tensor(); + a.resize_({3,4}); + REQUIRE(a.numel() == 12); + a.resize_({5, 7}); + REQUIRE(a.numel() == 35); + + } + + SECTION( "ones and dot" ) { + Tensor b0 = ones({1, 1}, type); + REQUIRE(2 == (b0+b0).sum().toCDouble()); + + Tensor b1 = ones({1, 2}, type); + REQUIRE(4 == (b1+b1).sum().toCDouble()); + + Tensor b = ones({3, 4}, type); + REQUIRE(24 == (b+b).sum().toCDouble()); + REQUIRE(12 == b.numel()); + REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + } + + SECTION( "rand" ) { + for(auto i = 0; i < 10; i++) { + Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); + } + } + + SECTION( "sort" ) { + Tensor b = rand({3, 4}, type); + + auto z = b.sort(1); + auto z_sorted = std::get<0>(z); + + REQUIRE(Scalar(z_sorted[0][0]).toFloat() < Scalar(z_sorted[0][1]).toFloat()); + } + + if(type.backend() != kCUDA) + SECTION( "randperm" ) { + Tensor b = randperm(15, type); + Tensor rv, ri; + std::tie(rv, ri) = sort(b, 0); + REQUIRE(Scalar(rv[0]).toFloat() <= Scalar(rv[1]).toFloat()); + } + + SECTION( "context" ) { + std::stringstream ss; + ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; + } + + SECTION( "add" ) { + Tensor a = rand({3, 4}, type); + Tensor b = rand({3, 4}, type); + Tensor c = add(a, add(a, b)); + //TODO:0-dim Tensor d(3.f); + Scalar d = 3.f; + REQUIRE( add(c, d).allclose(a + a + b + d) ); + } + + SECTION( "loads of adds" ) { + auto begin = std::chrono::high_resolution_clock::now(); + Tensor d = ones({3, 4}, type); + Tensor r = zeros({3, 4}, type); + for(auto i = 0; i < 100000; i++) { + add_out(r, r, d); + } + auto end = std::chrono::high_resolution_clock::now(); + //TODO TEST PERF? + std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; + REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + } + + SECTION( "loads of adds (with copy)" ) { + auto begin = std::chrono::high_resolution_clock::now(); + Tensor d = ones({3, 4}, type); + Tensor r = zeros({3, 4}, type); + for(auto i = 0; i < 100000; i++) { + r = add(r, d); + } + auto end = std::chrono::high_resolution_clock::now(); + //TODO TEST PERF? 
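+    // Unlike the in-place loop above, r = add(r, d) allocates a fresh result
+    // tensor on every iteration, so this also exercises allocation; the norm
+    // comparison below checks that all 100000 additions accumulated correctly.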
+ std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; + REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + } + + SECTION( "isContiguous" ) { + Tensor a = rand({3, 4}, type); + REQUIRE(a.is_contiguous()); + a = a.transpose(0, 1); + REQUIRE(!a.is_contiguous()); + } + + SECTION( "permute" ) { + Tensor a = rand({3, 4, 5}, type); + Tensor b = a.permute({1, 2, 0}); + REQUIRE(b.sizes().equals({4, 5, 3})); + REQUIRE(b.strides().equals({5, 1, 20})); + } + + SECTION( "mm" ) { + Tensor a = rand({3, 4}, type); + Tensor b = rand({4}, type); + Tensor c = mv(a, b); + REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + } + + SECTION( "squeeze" ) { + Tensor a = rand({2, 1}, type); + Tensor b = squeeze(a); + REQUIRE(b.dim() == 1); + a = rand({1}, type); + b = squeeze(a); + //TODO 0-dim squeeze + REQUIRE(a[0].equal(b)); + } + + SECTION( "copy" ) { + Tensor a = zeros({4, 3}, type); + Tensor e = rand({4, 3}, type); + a.copy_(e); + REQUIRE(a.equal(e)); + } + + SECTION( "copy (broadcasting)" ) { + Tensor a = zeros({4, 3}, type); + Tensor e = rand({3}, type); + a.copy_(e); + for (int i = 0; i < 4; ++i) { + REQUIRE(a[i].equal(e)); + } + } + + SECTION( "abs(value)" ) { + Tensor r = at::abs(type.scalarTensor(-3)); + REQUIRE(Scalar(r).toInt() == 3); + } + +//TODO(zach): operator overloads +#if 0 + { + std::cout << "eq (value):" << std::endl; + Tensor a = Tensor(10.f); + std::cout << (a == 11_i64) << " -- should be 0" << std::endl; + std::cout << (a == 10_i64) << " -- should be 1" << std::endl; + std::cout << (a == 10.) << " -- should be 1" << std::endl; + } +#endif + + SECTION( "adding a value with a scalar" ) { + Tensor a = rand({4, 3}, type); + REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + } + + SECTION( "select" ) { + Tensor a = rand({3, 7}, type); + auto a_13 = select(a, 1, 3); + auto a_13_02 = select(select(a, 1, 3), 0, 2); + REQUIRE( a[0][3].equal(a_13[0]) ); + REQUIRE( a[2][3].equal(a_13_02) ); + } + + SECTION( "zero-dim" ) { + Tensor a = type.scalarTensor(4); //rand(type, {1}); + + REQUIRE_NOTHROW(Scalar(a)); + Tensor b = rand({3,4}, type); + REQUIRE((a + a).dim() == 0); + REQUIRE((1 + a).dim() == 0); + REQUIRE((b + a).dim() == 2); + REQUIRE((a + b).dim() == 2); + auto c = rand({3,4}, type); + REQUIRE(c[1][2].dim() == 0); + + auto f = rand({3,4}, type); + f[2] = zeros({4}, type); + f[1][0] = -1; + REQUIRE(Scalar(f[2][0]).toDouble() == 0); + } + + SECTION( "tensor from TH" ) { + int a = 4; + THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); + THFloatTensor_fill(t, a); + Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); + REQUIRE_NOTHROW(tt); + } + + SECTION( "toCFloat" ) { + Tensor a = zeros({3,4}); + Tensor b = ones({3,7}); + Tensor c = cat({a,b},1); + REQUIRE(c.size(1) == 11); + + Tensor e = rand({}); + REQUIRE(*e.data() == e.sum().toCFloat()); + } + + SECTION( "to string" ) { + Tensor b = ones({3,7})*.0000001f; + std::stringstream s; + s << b << "\n"; + std::string expect = "1e-07 *"; + REQUIRE(s.str().substr(0,expect.size()) == expect); + } + SECTION("indexing by Scalar") { + Tensor tensor = arange(0, 10, kInt); + Tensor one = ones({1}, kInt); + for (int64_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { + REQUIRE(tensor[i].equal(one * static_cast(i))); + } + for (int i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for (int16_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for 
(int8_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + REQUIRE_THROWS_WITH( + tensor[Scalar(3.14)].equal(one), + StartsWith( + "Can only index tensors with integral scalars (got CPUDoubleType)")); + } + SECTION("indexing by zero-dim tensor") { + Tensor tensor = arange(0, 10, kInt); + Tensor one = ones({}, kInt); + for (int i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[one * i].equal(one * i)); + } + REQUIRE_THROWS_WITH( + tensor[ones({}) * 3.14].equal(one), + StartsWith( + "Can only index tensors with integral scalars (got CPUFloatType)")); + REQUIRE_THROWS_WITH( + tensor[Tensor()].equal(one), + StartsWith("Can only index with tensors that are defined")); + REQUIRE_THROWS_WITH( + tensor[ones({2, 3, 4}, kInt)].equal(one), + StartsWith("Can only index with tensors that are scalars (zero-dim)")); + } + SECTION("dispatch") { + Tensor tensor = randn({20, 20}); + Tensor other = randn({20, 20}); + auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); + REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + } +} + +TEST_CASE( "basic tests CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat)); +} + +TEST_CASE( "basic tests GPU", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if(at::hasCUDA()) { + test(CUDA(kFloat)); + } +} diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp new file mode 100644 index 0000000..2c98121 --- /dev/null +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -0,0 +1,154 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "broadcast", "[]" ) { + + manual_seed(123, at::Backend::CPU); + + Type & T = CPU(kFloat); + + // 0) pre-req tests: + SECTION( "can't expand empty tensor" ) { + auto empty = randn({0}, T); + REQUIRE_THROWS(empty.expand({3})); + } + + // 1) out-place function with 2 args + SECTION( "out-place function with 2 args" ) { + + SECTION( "basic" ) { + auto a = randn({3, 1}, T); + auto b = randn({5}, T); + std::vector expanded_sizes = {3, 5}; + REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + } + + SECTION( "with scalar" ) { + auto aScalar = ones({1}, T); + aScalar.get()->maybeScalar(true); + auto b = randn({3, 5}, T); + REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + } + + SECTION( "old fallback behavior yields error" ) { + auto a = randn({3, 5}, T); + auto b = randn({5, 3}, T); + REQUIRE_THROWS(a + b); + } + + SECTION( "with mismatched sizes" ) { + auto a = randn({3, 5}, T); + auto b = randn({7, 5}, T); + REQUIRE_THROWS(a + b); + } + } + + SECTION( "out-place function with 3 args" ) { + + SECTION( "basic" ) { + auto a = randn({3, 1, 1}, T); + auto b = randn({1, 2, 1}, T); + auto c = randn({1, 1, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + } + + SECTION( "with scalar" ) { + auto aTensorScalar = ones({1}, T); + aTensorScalar.get()->maybeScalar(true); + auto b = randn({3, 2, 1}, T); + auto c = randn({1, 2, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + REQUIRE(aTensorScalar.addcmul(b, c).equal( + aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); + } + + SECTION( "old fallback behavior yields error" ) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 3, 2}, T); + REQUIRE_THROWS(a.addcmul(b, 
c)); + } + + SECTION( "with mismatched sizes" ){ + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 5, 5}, T); + REQUIRE_THROWS(a.addcmul(b, c)); + } + } + + SECTION( "in-place function with 2 args" ) { + SECTION( "basic" ) { + auto a = randn({3, 5}, T); + auto b = randn({3, 1}, T); + REQUIRE((a + b).equal(a + b.expand({3, 5}))); + } + + SECTION( "with scalar" ) { + auto a = randn({3, 5}, T); + auto bScalar = ones({1}, T); + bScalar.get()->maybeScalar(true); + REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + } + + SECTION( "error: would have to expand inplace arg" ) { + auto a = randn({1, 5}, T); + auto b = randn({3, 1}, T); + REQUIRE_THROWS(a.add_(b)); + } + } + + SECTION( "in-place function with 3 args" ) { + + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + + SECTION( "basic" ) { + auto aClone = a.clone(); + REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + } + + SECTION( "with scalar" ) { + auto aClone = a.clone(); + auto bScalar = ones({1}, T); + bScalar.get()->maybeScalar(true); + REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + } + + SECTION( "error: would have to expand inplace arg" ) { + auto a = randn({1, 3, 5}, T); + auto b = randn({4, 1, 1}, T); + auto c = randn({1, 3, 1}, T); + REQUIRE_THROWS(a.addcmul_(b, c)); + } + } + + SECTION( "explicit dim specification" ) { + + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + + SECTION( "basic" ) { + REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + } + + SECTION( "with scalar" ) { + Tensor aScalar = ones({1}, T); + aScalar.get()->maybeScalar(true); + REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + } + + SECTION( "with mismatched sizes" ) { + auto a = randn({3, 3}, T); + REQUIRE_THROWS(a.addmm(b, c)); + } + } +} diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp new file mode 100644 index 0000000..d32903d --- /dev/null +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -0,0 +1,27 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "cuda.h" +#include "cuda_runtime.h" +#include + +void makeRandomNumber() { + cudaSetDevice(std::rand() % 2); + auto x = at::randn({1000}); +} + +void testCudaRNGMultithread() { + auto threads = std::vector(); + for (auto i = 0; i < 1000; i++) { + threads.emplace_back(makeRandomNumber); + } + for (auto& t : threads) { + t.join(); + } +}; + +TEST_CASE( "CUDA RNG test", "[cuda]" ) { + SECTION( "multithread" ) + testCudaRNGMultithread(); +} diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp new file mode 100644 index 0000000..7c1bc96 --- /dev/null +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -0,0 +1,25 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/cudnn/Descriptors.h" +#include "ATen/cudnn/Handles.h" +#include "test_seed.h" + +using namespace at; +using namespace at::native; + +TEST_CASE( "cudnn", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + +#if CUDNN_VERSION < 7000 + auto handle = getCudnnHandle(); + DropoutDescriptor desc1, desc2; + desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); + desc2.set(handle, 0.5, desc1.state); + + REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + REQUIRE(desc1.desc()->states == 
desc2.desc()->states); +#endif +} diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp new file mode 100644 index 0000000..1603e3d --- /dev/null +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -0,0 +1,27 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/DLConvertor.h" + +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "dlconvertor", "[cpu]" ) { + + manual_seed(123, at::Backend::CPU); + + INFO( "convert ATen to DLTensor" ); + + Tensor a = rand({3,4}); + DLManagedTensor* dlMTensor = toDLPack(a); + + INFO( "convert DLTensor to ATen" ); + Tensor b = fromDLPack(dlMTensor); + + REQUIRE(a.equal(b)); +} diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp new file mode 100644 index 0000000..fc70522 --- /dev/null +++ b/aten/src/ATen/test/half_test.cpp @@ -0,0 +1,117 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include +#include +#include +#include +#include + +using namespace at; + +TEST_CASE( "half arithmetic", "[]" ) { + Half zero = 0; + Half one = 1; + REQUIRE(zero + one == one); + REQUIRE(zero + zero == zero); + REQUIRE(zero * one == zero); + REQUIRE(one * one == one); + REQUIRE(one / one == one); + REQUIRE(one - one == zero); + REQUIRE(one - zero == one); + REQUIRE(zero - one == -one); + REQUIRE(one + one == Half(2)); + REQUIRE(one + one == 2); +} + +TEST_CASE( "half comparisons", "[]" ) { + Half zero = 0; + Half one = 1; + REQUIRE(zero < one); + REQUIRE(zero < 1); + REQUIRE(1 > zero); + REQUIRE(0 >= zero); + REQUIRE(0 != one); + REQUIRE(zero == 0); + REQUIRE(zero == zero); + REQUIRE(zero == -zero); +} + +TEST_CASE( "half cast", "[]" ) { + Half value = 1.5f; + REQUIRE((int)value == 1); + REQUIRE((short)value == 1); + REQUIRE((long long)value == 1LL); + REQUIRE((float)value == 1.5f); + REQUIRE((double)value == 1.5); + REQUIRE((bool)value == true); + REQUIRE((bool)Half(0.0f) == false); +} + +TEST_CASE( "half construction", "[]" ) { + REQUIRE(Half((short)3) == Half(3.0f)); + REQUIRE(Half((unsigned short)3) == Half(3.0f)); + REQUIRE(Half(3) == Half(3.0f)); + REQUIRE(Half(3U) == Half(3.0f)); + REQUIRE(Half(3LL) == Half(3.0f)); + REQUIRE(Half(3ULL) == Half(3.0f)); + REQUIRE(Half(3.5) == Half(3.5f)); +} + +static std::string to_string(const Half& h) { + std::stringstream ss; + ss << h; + return ss.str(); +} + +TEST_CASE( "half to string", "[]" ) { + REQUIRE(to_string(Half(3.5f)) == "3.5"); + REQUIRE(to_string(Half(-100.0f)) == "-100"); +} + +TEST_CASE( "half numeric limits", "[]" ) { + using limits = std::numeric_limits; + REQUIRE(limits::lowest() == -65504.0f); + REQUIRE(limits::max() == 65504.0f); + REQUIRE(limits::min() > 0); + REQUIRE(limits::min() < 1); + REQUIRE(limits::denorm_min() > 0); + REQUIRE(limits::denorm_min() / 2 == 0); + REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); +} + +// Check the declared type of members of numeric_limits matches +// the declared type of that member on numeric_limits + +#define ASSERT_SAME_TYPE(name) \ + static_assert( \ + std::is_same< \ + decltype(std::numeric_limits::name), \ + decltype(std::numeric_limits::name)>::value, \ + "decltype(" #name ") differs") + +ASSERT_SAME_TYPE(is_specialized); +ASSERT_SAME_TYPE(is_signed); +ASSERT_SAME_TYPE(is_integer); +ASSERT_SAME_TYPE(is_exact); +ASSERT_SAME_TYPE(has_infinity); +ASSERT_SAME_TYPE(has_quiet_NaN); 
+ASSERT_SAME_TYPE(has_signaling_NaN); +ASSERT_SAME_TYPE(has_denorm); +ASSERT_SAME_TYPE(has_denorm_loss); +ASSERT_SAME_TYPE(round_style); +ASSERT_SAME_TYPE(is_iec559); +ASSERT_SAME_TYPE(is_bounded); +ASSERT_SAME_TYPE(is_modulo); +ASSERT_SAME_TYPE(digits); +ASSERT_SAME_TYPE(digits10); +ASSERT_SAME_TYPE(max_digits10); +ASSERT_SAME_TYPE(radix); +ASSERT_SAME_TYPE(min_exponent); +ASSERT_SAME_TYPE(min_exponent10); +ASSERT_SAME_TYPE(max_exponent); +ASSERT_SAME_TYPE(max_exponent10); +ASSERT_SAME_TYPE(traps); +ASSERT_SAME_TYPE(tinyness_before); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu new file mode 100644 index 0000000..4c63ab3 --- /dev/null +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -0,0 +1,190 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +// Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or +// (b-1), so it takes a few minutes to run. + +#include +#include +#include +#include + +#include "THC/THCIntegerDivider.cuh" + +using std::vector; + +template +struct TestCase { + Value dividend; + int divisor_idx; + int steps; + + TestCase(Value dividend, int divisor_idx, int steps) + : dividend(dividend), divisor_idx(divisor_idx), steps(steps) { } +}; + +template +__global__ void testIntDivider(const IntDivider *dividers, + const TestCase *testCases, + int numCases) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = index; i < numCases; i += stride) { + const TestCase &tc = testCases[i]; + Value dividend = tc.dividend; + const IntDivider ÷r = dividers[tc.divisor_idx]; + Value divisor = divider.divisor; + + for (int j = 0; j < tc.steps; j++) { + if (sizeof(Value) == 4 && dividend > INT32_MAX) return; + + DivMod qr = divider.divmod(dividend); + assert(qr.div == dividend / divisor && qr.mod == dividend % divisor); + dividend += divisor; + } + } +} + +enum { + // Number of test cases per each kernel invocation. + NUM_CASES = 1000000, + + // Maximum number of steps per each test case. + MAX_STEPS = 10000, +}; + +// Test the magic division algorithm. +template +class IntDividerTester { + public: + IntDividerTester() { + cudaError_t err; + + err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); + REQUIRE(err == cudaSuccess); + err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); + REQUIRE(err == cudaSuccess); + } + + ~IntDividerTester() { + cudaError_t err; + + err = cudaFree(dividersBuf_); + REQUIRE(err == cudaSuccess); + err = cudaFree(testCasesBuf_); + REQUIRE(err == cudaSuccess); + } + + void addTestCase(Value dividend, Value divisor, int steps) { + // Append a new IntDivider using 'divisor' if necessary. + if (dividers_.empty() || dividers_.back().divisor != divisor) + dividers_.emplace_back(divisor); + + // Append the test case. + testCases_.emplace_back(dividend, dividers_.size() - 1, steps); + + // Launch the test kernel if the buffer is full. 
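+    // flush() copies the buffered dividers and test cases to the device,
+    // launches testIntDivider over them, and then clears the host buffers,
+    // so one kernel launch validates up to NUM_CASES cases in parallel.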
+ if (testCases_.size() == NUM_CASES) flush(); + } + + void flush() { + cudaError_t err; + + if (testCases_.empty()) return; + REQUIRE(!dividers_.empty()); + + REQUIRE(dividers_.size() <= NUM_CASES); + REQUIRE(testCases_.size() <= NUM_CASES); + err = cudaMemcpy(dividersBuf_, dividers_.data(), + dividers_.size() * sizeof(IntDivider), + cudaMemcpyHostToDevice); + REQUIRE(err == cudaSuccess); + err = cudaMemcpy(testCasesBuf_, testCases_.data(), + testCases_.size() * sizeof(TestCase), + cudaMemcpyHostToDevice); + REQUIRE(err == cudaSuccess); + + int numCases = testCases_.size(); + testIntDivider<<<512, 512>>>( + dividersBuf_, testCasesBuf_, numCases); + + dividers_.clear(); + testCases_.clear(); + } + + private: + vector> dividers_; + vector> testCases_; + + IntDivider *dividersBuf_; + TestCase *testCasesBuf_; +}; + +static void testUint32Divider() +{ + fprintf(stderr, "Testing 32-bit integer division ..."); + + IntDividerTester tester; + + for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) { + if (divisor < 1000000 && divisor % 10000 == 0) fprintf(stderr, "."); + if (divisor % 10000000 == 0) fprintf(stderr, "-"); + + // In order to save time, we only test when the remainder is zero or + // (divisor - 1). + uint64_t dividend = 0; + while (dividend <= INT32_MAX) { + uint64_t steps = (INT32_MAX - dividend) / divisor + 1; + if (steps > MAX_STEPS) steps = MAX_STEPS; + + tester.addTestCase(dividend, divisor, steps); + tester.addTestCase(dividend + divisor - 1, divisor, steps); + + dividend += divisor * steps; + } + + // Check the boundary cases. + tester.addTestCase(1, divisor, 1); + tester.addTestCase(INT32_MAX, divisor, 1); + } + + tester.flush(); + + fprintf(stderr, " Done!\n"); +} + +// uint64_t divider uses plain division, so we just check a few random cases. +static void testUint64Divider() +{ + IntDividerTester tester; + + uint64_t dividend = 0x123456789ULL; + uint64_t divisor = 0x54321ULL; + + for (int i = 0; i < 1000; i++) { + if (divisor != 0) { + tester.addTestCase(dividend, divisor, 100); + + // Test small divisor. + tester.addTestCase(dividend, divisor % 65536, 100); + + // Create pseudorandom numbers. 
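+      // (a simple multiply/xor scramble -- we only need varied operands here,
+      // not statistical randomness)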
+ dividend *= 0x100000001b3ULL; + dividend ^= 0x1234567890abcdefULL; + divisor *= 0x100000001b3ULL; + divisor ^= 0x1234567890abcdefULL; + } + } + + tester.flush(); +} + +TEST_CASE( "CUDA integer divider", "[cuda]" ) { + + testUint64Divider(); + testUint32Divider(); + + cudaError_t err = cudaDeviceSynchronize(); + REQUIRE(err == cudaSuccess); +} diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp new file mode 100644 index 0000000..99a21d3 --- /dev/null +++ b/aten/src/ATen/test/native_test.cpp @@ -0,0 +1,193 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +#define REQUIRE_EQUAL(t1, t2) \ + REQUIRE(t1.equal(t2)); + +#define REQUIRE_ALLCLOSE(t1, t2) \ + REQUIRE(t1.is_same_size(t2)); \ + REQUIRE(t1.allclose(t2)); + +#define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + REQUIRE(t1.is_same_size(t2)); \ + REQUIRE(t1.allclose(t2, atol, rtol)); + +void requireEqualTensorList(TensorList t1, TensorList t2) { + REQUIRE(t1.size() == t2.size()); + for (size_t i = 0; i < t1.size(); ++i) { + REQUIRE_EQUAL(t1[ i ], t2[ i ]); + } +} + +void test(Type & T, Type & AccT) { + auto t = randn({3, 3}, T); + + SECTION( "split: test method, type, namespace give same result" ) { + auto splitMethod = t.split(1, 0); + auto splitType = T.split(t, 1, 0); + auto splitNs = at::split(t, 1, 0); + requireEqualTensorList(splitMethod, splitType); + requireEqualTensorList(splitMethod, splitNs); + + // test rebuilding with cat + REQUIRE_EQUAL(at::cat(splitMethod, 0), t); + } + + SECTION( "chunk: test method, type, namespace give same result" ) { + // test method, type, namespace give same result + auto chunkMethod = t.chunk(3, 0); + auto chunkType = T.chunk(t, 3, 0); + auto chunkNs = at::chunk(t, 3, 0); + requireEqualTensorList(chunkMethod, chunkType); + requireEqualTensorList(chunkMethod, chunkNs); + + // test rebuilding with cat + REQUIRE_EQUAL(at::cat(chunkMethod, 0), t); + } + + // stack + SECTION( "stack" ) { + auto x = rand({2, 3, 4}); + auto y = rand({2, 3, 4}); + auto z = rand({2, 3, 4}); + for (int64_t dim = 0; dim < 4; ++dim) { + auto res = at::stack({x, y, z}, dim); + auto res_neg = at::stack({x, y, z}, dim - 4); + std::vector expected_size; + expected_size.insert(expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); + expected_size.insert(expected_size.end(), 3); + expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); + + REQUIRE_EQUAL(res, res_neg); + REQUIRE(res.sizes().equals(expected_size)); + REQUIRE_EQUAL(res.select(dim, 0), x); + REQUIRE_EQUAL(res.select(dim, 1), y); + REQUIRE_EQUAL(res.select(dim, 2), z); + } + } + + SECTION( "size / stride" ) { + auto scalar = randn({}, T); + REQUIRE_THROWS_WITH(scalar.size(0), "dimension specified as 0 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.size(-1), "dimension specified as -1 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.stride(0), "dimension specified as 0 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.stride(-1), "dimension specified as -1 but tensor has no dimensions"); + + auto empty = randn({0}, T); + REQUIRE(empty.size(0) == 0); + REQUIRE(empty.size(-1) == 0); + REQUIRE(empty.stride(0) == 1); + REQUIRE(empty.stride(-1) == 1); + } + + // matmul + SECTION( "matmul" ) { + auto scalar = randn({}, T); + auto d1 = randn({3}, T); + auto d2 = randn({2, 3}, T); + + // 0-d + REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at 
least 1D")); + REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + + // 1-d + REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); + REQUIRE_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); + auto d1o = randn({2}, T); + REQUIRE_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); + + // 2-d + auto d2o = randn({3, 5}, T); + REQUIRE_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); + + // > 2-d, 1-d + auto d3 = randn({5, 2, 3}, T); + REQUIRE_ALLCLOSE(d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); + REQUIRE_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); + + auto d5 = randn({3, 2, 4, 2, 3}, T); + REQUIRE_ALLCLOSE(d5.matmul(d1), d5.view({24, 2, 3}).bmm(d1.view({1, 3, 1}).expand({24, 3, 1})).view({3, 2, 4, 2})); + REQUIRE_ALLCLOSE(d1o.matmul(d5), d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); + + // > 2-d, 2-d + // we use a "folding" algorithm in this case of matmul, so the direct comparison to bmm doesn't work; + // instead, compare to the higher precision computation (technically, we should always do this). + // Tolerances are selected empirically. + double atol = 1e-04; + double rtol = 1e-06; + d2 = randn({3, 4}, T); + d2o = randn({4, 2}, T); + auto result = d5.matmul(d2).toType(AccT); + + auto d5Acc = d5.toType(AccT); + auto d2Acc = d2.toType(AccT); + auto acc_result = d5Acc.view({24, 2, 3}).bmm(d2Acc.expand({24, 3, 4})).view({3, 2, 4, 2, 4}); + REQUIRE_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); + REQUIRE_ALLCLOSE(d2o.matmul(d5), d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); + + // > 2-d, > 2-d + auto d5o = randn({2, 1, 2, 4, 3, 2}, T); + auto d5_bmm_view = d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); + auto d5o_bmm_view = d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); + REQUIRE_ALLCLOSE(d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); + + // non-expandable case + auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); + REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + } + + // _standard_gamma_grad + SECTION( "_standard_gamma_grad" ) { + // check empty + auto empty = ones({0}, T); + REQUIRE_EQUAL(empty, empty._standard_gamma_grad(empty)); + + // check scalar equals one element + auto one_scalar = ones({}, T).mul(5); + auto one_with_dim = ones({1}, T).mul(5); + REQUIRE_ALLCLOSE(one_scalar._standard_gamma_grad(one_scalar), + one_with_dim._standard_gamma_grad(one_with_dim).sum()); + + // check mixing types + auto t1 = randn({3, 4}, T); + auto t2 = randn({3, 4}, T).toType(kDouble); + REQUIRE_THROWS_WITH(t1._standard_gamma_grad(t2), Catch::StartsWith("expected scalar type")); + } + + SECTION( "where" ) { + // empty + auto empty = ones({0}, T); + auto &bT = T.toScalarType(ScalarType::Byte); + auto empty_byte = ones({0}, bT); + REQUIRE_EQUAL(empty, at::where(empty_byte, empty, empty)); + + // check scalar equals one element + auto x_scalar = ones({}, T).mul(5); + auto y_scalar = ones({}, T).mul(7); + auto cond_scalar = zeros({}, bT); + auto x_1d = x_scalar.unsqueeze(0); + auto y_1d = y_scalar.unsqueeze(0); + auto cond_1d = cond_scalar.unsqueeze(0); + REQUIRE_ALLCLOSE(at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), + at::where(cond_1d, x_1d, y_1d)); + } +} + +TEST_CASE( "native test CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat), CPU(kDouble)); +} + +TEST_CASE( "native test CUDA", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if 
(at::hasCUDA()) { + test(CUDA(kFloat), CUDA(kDouble)); + } +} diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp new file mode 100644 index 0000000..620e5ec --- /dev/null +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -0,0 +1,286 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include +#include +#include + +using namespace at; + +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::exception &e) { \ + REQUIRE(!_passed); \ + catc; \ + } \ + } + +void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { + REQUIRE(lhs.dim() == rhs.dim()); + REQUIRE(lhs.sizes().equals(rhs.sizes())); +} + +bool should_expand(const IntList &from_size, const IntList &to_size) { + if(from_size.size() > to_size.size()) { + return false; + } + for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); ++from_dim_it) { + for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); ++to_dim_it) { + if (*from_dim_it != 1 && *from_dim_it != *to_dim_it) { + return false; + } + } + } + return true; +} + +void test(Type &T) { + std::vector > sizes = { {}, {0}, {1}, {1, 1}, {2}}; + + // single-tensor/size tests + for (auto s = sizes.begin(); s != sizes.end(); ++s) { + // verify that the dim, sizes, strides, etc match what was requested. + auto t = ones(*s, T); + REQUIRE((size_t)t.dim() == s->size()); + REQUIRE((size_t)t.ndimension() == s->size()); + REQUIRE(t.sizes().equals(*s)); + REQUIRE(t.strides().size() == s->size()); + auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); + REQUIRE(t.numel() == numel); + // verify we can output + std::stringstream ss; + REQUIRE_NOTHROW(ss << t << std::endl); + + // set_ + auto t2 = ones(*s, T); + t2.set_(); + require_equal_size_dim(t2, ones({0}, T)); + + // unsqueeze + if (t.numel() != 0) { + REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t.unsqueeze(0)); + } + + // unsqueeze_ + { + auto t2 = ones(*s, T); + if (t2.numel() != 0) { + auto r = t2.unsqueeze_(0); + REQUIRE(r.dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t2.unsqueeze_(0)); + } + } + + // squeeze (with dimension argument) + if (t.dim() == 0 || t.sizes()[0] == 1) { + REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + } else { + // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; + // in NumPy this is an error. + REQUIRE(t.squeeze(0).dim() == t.dim()); + } + + // squeeze (with no dimension argument) + { + std::vector size_without_ones; + for (auto size : *s) { + if (size != 1) { + size_without_ones.push_back(size); + } + } + auto result = t.squeeze(); + require_equal_size_dim(result, ones(size_without_ones, T)); + } + + { + // squeeze_ (with dimension argument) + auto t2 = ones(*s, T); + if (t2.dim() == 0 || t2.sizes()[0] == 1) { + REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + } else { + // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; + // in NumPy this is an error. 
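+        // so squeeze_(0) must leave the number of dimensions unchanged: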
+ REQUIRE(t2.squeeze_(0).dim() == t.dim()); + } + } + + // squeeze_ (with no dimension argument) + { + auto t2 = ones(*s, T); + std::vector size_without_ones; + for (auto size : *s) { + if (size != 1) { + size_without_ones.push_back(size); + } + } + auto r = t2.squeeze_(); + require_equal_size_dim(t2, ones(size_without_ones, T)); + } + + // reduce (with dimension argument and with 1 return argument) + if (t.numel() != 0) { + REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE(t.sum(0).equal(at::zeros({}, T))); + } + + // reduce (with dimension argument and with 2 return arguments) + if (t.numel() != 0) { + auto ret = t.min(0); + REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE_THROWS(t.min(0)); + } + + // simple indexing + if (t.dim() > 0 && t.numel() != 0) { + REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE_THROWS(t[0]); + } + + // fill_ (argument to fill_ can only be a 0-dim tensor) + TRY_CATCH_ELSE(t.fill_(t.sum(0)), + REQUIRE(t.dim() > 1), + REQUIRE(t.dim() <= 1)); + } + + for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { + for (auto rhs_it = sizes.begin(); rhs_it != sizes.end(); ++rhs_it) { + // is_same_size should only match if they are the same shape + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + if(*lhs_it != *rhs_it) { + REQUIRE(!lhs.is_same_size(rhs)); + REQUIRE(!rhs.is_same_size(lhs)); + } + } + // forced size functions (resize_, resize_as, set_) + { + // resize_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_(*rhs_it); + require_equal_size_dim(lhs, rhs); + } + // resize_as_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_as_(rhs); + require_equal_size_dim(lhs, rhs); + } + // set_ + { + { + // with tensor + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.set_(rhs); + require_equal_size_dim(lhs, rhs); + } + { + // with storage + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel()); + lhs.set_(*storage); + // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars + REQUIRE(lhs.dim() != 0); + } + { + // with storage, offset, sizes, strides + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel()); + lhs.set_(*storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); + require_equal_size_dim(lhs, rhs); + } + } + } + + // view + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), + REQUIRE(lhs.numel() != rhs.numel()), + REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + } + + // take + { + auto lhs = ones(*lhs_it, T); + auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); + TRY_CATCH_ELSE(auto result = lhs.take(rhs), + REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + require_equal_size_dim(result, rhs)); + } + + + // ger + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + TRY_CATCH_ELSE(auto result = lhs.ger(rhs), + REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + [&]() { + int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); + int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); + require_equal_size_dim(result, result.type().tensor({dim0, dim1})); + }();); + } + + // expand + { + auto lhs = ones(*lhs_it, T); + auto lhs_size = *lhs_it; + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + bool should_pass = should_expand(lhs_size, rhs_size); + TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), + REQUIRE(!should_pass), + REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + + // in-place functions (would be good if we can also do a non-broadcasting one, b/c + // broadcasting functions will always end up operating on tensors of same size; + // is there an example of this outside of assign_ ?) + { + bool should_pass_inplace = should_expand(rhs_size, lhs_size); + TRY_CATCH_ELSE(lhs.add_(rhs), + REQUIRE(!should_pass_inplace), + REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + } + } + } + } +} + +TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat)); +} + +TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if (at::hasCUDA()) { + test(CUDA(kFloat)); + } +} diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp new file mode 100644 index 0000000..ccdab08 --- /dev/null +++ b/aten/src/ATen/test/scalar_test.cpp @@ -0,0 +1,151 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "test_seed.h" + +using std::cout; +using namespace at; + +constexpr auto Float = ScalarType::Float; + +template +struct Foo { + static void apply(Tensor a, Tensor b) { + scalar_type s = 1; + std::stringstream ss; + ss << "hello, dispatch: " << a.type().toString() << s << "\n"; + auto data = (scalar_type*)a.data_ptr(); + (void)data; + } +}; +template<> +struct Foo { + static void apply(Tensor a, Tensor b) {} +}; + +void test_ctors() { + // create scalars backed by tensors + auto s1 = Scalar(CPU(kFloat).scalarTensor(1)); + auto s2 = Scalar(CPU(kFloat).scalarTensor(2)); + Scalar{s1}; + Scalar{std::move(s2)}; + REQUIRE(s2.isBackedByTensor()); + REQUIRE(!s2.toTensor().defined()); + s2 = s1; + REQUIRE(s2.isBackedByTensor()); + REQUIRE(s2.toFloat() == 1.0); + Scalar s3; + s3 = std::move(s2); + REQUIRE(s2.isBackedByTensor()); + REQUIRE(!s2.toTensor().defined()); + REQUIRE(s3.isBackedByTensor()); + REQUIRE(s3.toFloat() == 1.0); +} + +void test_overflow() { + auto s1 = Scalar(M_PI); + REQUIRE(s1.toFloat() == static_cast(M_PI)); + s1.toHalf(); + + s1 = Scalar(100000); + REQUIRE(s1.toFloat() == 100000.0); + REQUIRE(s1.toInt() == 100000); + + REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + + s1 = Scalar(NAN); + REQUIRE(std::isnan(s1.toFloat())); + REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + + s1 = Scalar(INFINITY); + REQUIRE(std::isinf(s1.toFloat())); + REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); +} + +TEST_CASE( "scalar test", "[]" ) { + + manual_seed(123, at::Backend::CPU); + manual_seed(123, at::Backend::CUDA); + + Scalar what = 257; + Scalar bar = 3.0; + Half h = bar.toHalf(); + Scalar h2 = h; + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; + Generator & gen = at::globalContext().defaultGenerator(Backend::CPU); + REQUIRE_NOTHROW(gen.seed()); + auto && C = at::globalContext(); + if(at::hasCUDA()) { + auto & CUDAFloat = 
C.getType(Backend::CUDA,ScalarType::Float); + auto t2 = zeros({4,4}, CUDAFloat); + cout << &t2 << "\n"; + cout << "AFTER GET TYPE " << &CUDAFloat << "\n"; + auto s = CUDAFloat.storage(4); + REQUIRE( s->get(3).toFloat() == 0.0 ); + s->fill(7); + REQUIRE( s->get(3).toFloat() == 7.0 ); + } + auto t = ones({4,4}); + + auto wha2 = zeros({4,4}).add(t).sum(); + REQUIRE( wha2.toCDouble() == 16.0 ); + + REQUIRE( t.sizes()[0] == 4 ); + REQUIRE( t.sizes()[1] == 4 ); + REQUIRE( t.strides()[0] == 4 ); + REQUIRE( t.strides()[1] == 1 ); + + Type & T = CPU(Float); + Tensor x = randn({1,10}, T); + Tensor prev_h = randn({1,20}, T); + Tensor W_h = randn({20,20}, T); + Tensor W_x = randn({20,10}, T); + Tensor i2h = at::mm(W_x, x.t()); + Tensor h2h = at::mm(W_h, prev_h.t()); + Tensor next_h = i2h.add(h2h); + next_h = next_h.tanh(); + + REQUIRE_THROWS(Scalar{Tensor{}}); + + test_ctors(); + test_overflow(); + + if(at::hasCUDA()) { + auto r = CUDA(Float).copy(next_h); + REQUIRE(CPU(Float).copy(r).equal(next_h)); + } + REQUIRE_NOTHROW(randn({10,10,2}, T)); + + // check Scalar.toTensor on Scalars backed by different data types + REQUIRE(bar.toTensor().type().scalarType() == kDouble); + REQUIRE(what.toTensor().type().scalarType() == kLong); + REQUIRE(Scalar(ones({})).toTensor().type().scalarType() == kFloat); + + if (x.type().scalarType() != ScalarType::Half) { + AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { + scalar_t s = 1; + std::stringstream ss; + REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + auto data = (scalar_t*)x.data_ptr(); + (void)data; + }); + } + + // test direct C-scalar type conversions + { + auto x = ones({1,2}, T); + REQUIRE_THROWS(x.toCFloat()); + } + auto float_one = ones({}, T); + REQUIRE(float_one.toCFloat() == 1); + REQUIRE(float_one.toCInt() == 1); + REQUIRE((float_one.toCHalf() == 1)); +} diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp new file mode 100644 index 0000000..8946026 --- /dev/null +++ b/aten/src/ATen/test/stream_test.cpp @@ -0,0 +1,103 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" + +#include "cuda_runtime.h" + +#include +#include + +/* +Tests related to ATen streams. 
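+These tests require a CUDA build; they cover copying and moving streams, getting and setting the current (thread-local) stream, and manual retain/free of the underlying stream handle.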
+*/ +TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copying and moving") { + int32_t device = -1; + cudaStream_t cuda_stream; + + // Tests that copying works as expected and preserves the stream + at::CUDAStream copyStream; + { + auto s = at::globalContext().createCUDAStream(); + device = s.device(); + cuda_stream = s.stream(); + + copyStream = s; + + REQUIRE(copyStream.internals() == s.internals()); + REQUIRE(copyStream.device() == device); + REQUIRE(copyStream.stream() == cuda_stream); + } + + REQUIRE(copyStream.internals()); + REQUIRE(copyStream.device() == device); + REQUIRE(copyStream.stream() == cuda_stream); + + // Tests that moving works as expected and preserves the stream + at::CUDAStream moveStream; + { + auto s = at::globalContext().createCUDAStream(); + device = s.device(); + cuda_stream = s.stream(); + + moveStream = std::move(s); + + REQUIRE(moveStream.device() == device); + REQUIRE(moveStream.stream() == cuda_stream); + } + + REQUIRE(moveStream.internals()); + REQUIRE(moveStream.device() == device); + REQUIRE(moveStream.stream() == cuda_stream); +} + +TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { + at::CUDAStream myStream = at::globalContext().createCUDAStream(); + + // Sets and gets + at::globalContext().setCurrentCUDAStream(myStream); + at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); + + REQUIRE(myStream == curStream); + + // Gets, sets, and gets default stream + at::CUDAStream defaultStream = at::globalContext().getDefaultCUDAStream(); + at::globalContext().setCurrentCUDAStream(defaultStream); + curStream = at::globalContext().getCurrentCUDAStream(); + + REQUIRE(defaultStream != myStream); + REQUIRE(curStream == defaultStream); +} + +TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { + auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( + at::CUDAStream::DEFAULT_FLAGS + , at::CUDAStream::DEFAULT_PRIORITY); + + at::detail::CUDAStream_free(ptr); + REQUIRE(ptr == nullptr); +} + +void thread_fun(at::CUDAStream& cur_thread_stream) { + auto new_stream = at::globalContext().createCUDAStream(); + at::globalContext().setCurrentCUDAStream(new_stream); + cur_thread_stream = at::globalContext().getCurrentCUDAStream(); + REQUIRE(cur_thread_stream == new_stream); +} + +TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") { + at::CUDAStream s0, s1; + + std::thread t0{thread_fun, std::ref(s0)}; + std::thread t1{thread_fun, std::ref(s1)}; + t0.join(); + t1.join(); + + at::CUDAStream cur_stream = at::globalContext().getCurrentCUDAStream(); + at::CUDAStream default_stream = at::globalContext().getDefaultCUDAStream(); + + REQUIRE(cur_stream == default_stream); + REQUIRE(cur_stream != s0); + REQUIRE(cur_stream != s1); + REQUIRE(s0 != s1); +} diff --git a/aten/src/ATen/test/tbb_init_test.cpp b/aten/src/ATen/test/tbb_init_test.cpp new file mode 100644 index 0000000..027b878 --- /dev/null +++ b/aten/src/ATen/test/tbb_init_test.cpp @@ -0,0 +1,43 @@ +#include "ATen/ATen.h" +#include "ATen/Parallel.h" +#include "test_assert.h" +#include "test_seed.h" +#include + +using namespace at; + +// This checks whether threads can see the global +// numbers of threads set and also whether the scheduler +// will throw an exception when multiple threads call +// their first parallel construct. 
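+// In this test, a negative given_num_threads means set_num_threads() has not been called yet, +// so get_num_threads() is expected to still report -1.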
+void test(int given_num_threads) { + auto t = ones({1000 * 1000}, CPU(kFloat)); + if (given_num_threads >= 0) { + ASSERT(at::get_num_threads() == given_num_threads); + } else { + ASSERT(at::get_num_threads() == -1); + } + auto t_sum = t.sum(); + for (int i = 0; i < 1000; i ++) { + t_sum = t_sum + t.sum(); + } +} + +int main() { + manual_seed(123, at::Backend::CPU); + + test(-1); + std::thread t1(test, -1); + t1.join(); + at::set_num_threads(4); + std::thread t2(test, 4); + std::thread t3(test, 4); + std::thread t4(test, 4); + t4.join(); + t3.join(); + t2.join(); + at::set_num_threads(5); + test(5); + + return 0; +} diff --git a/aten/src/ATen/test/test_assert.h b/aten/src/ATen/test/test_assert.h new file mode 100644 index 0000000..8b01172 --- /dev/null +++ b/aten/src/ATen/test/test_assert.h @@ -0,0 +1,67 @@ +#pragma once +#include <stdarg.h> +#include <stdexcept> + +static inline void barf(const char *fmt, ...) { + char msg[2048]; + va_list args; + va_start(args, fmt); + vsnprintf(msg, 2048, fmt, args); + va_end(args); + throw std::runtime_error(msg); +} + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define AT_EXPECT(x, y) (__builtin_expect((x),(y))) +#else +#define AT_EXPECT(x, y) (x) +#endif + +#define ASSERT(cond) \ + if (AT_EXPECT(!(cond), 0)) { \ + barf("%s:%u: %s: Assertion `%s` failed.", __FILE__, __LINE__, __func__, #cond); \ + } + +//note: msg must be a string literal +//note: in ", ##__VA_ARGS__" the '##' suppresses the comma if __VA_ARGS__ is empty +#define ASSERTM(cond, msg, ...) \ + if (AT_EXPECT(!(cond), 0)) { \ + barf("%s:%u: %s: Assertion `%s` failed: " msg , __FILE__, __LINE__, __func__, #cond,##__VA_ARGS__); \ + } + +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::runtime_error &e) { \ + ASSERT(!_passed); \ + catc; \ + } \ + } + +#define ASSERT_THROWSM(fn, message) \ + TRY_CATCH_ELSE(fn, ASSERT(std::string(e.what()).find(message) != std::string::npos), ASSERT(false)) + +#define ASSERT_THROWS(fn) \ + ASSERT_THROWSM(fn, ""); + +#define ASSERT_EQUAL(t1, t2) \ + ASSERT(t1.equal(t2)); + +// allclose broadcasts, so check same size before allclose. +#define ASSERT_ALLCLOSE(t1, t2) \ + ASSERT(t1.is_same_size(t2)); \ + ASSERT(t1.allclose(t2)); + +// allclose broadcasts, so check same size before allclose.
+#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT(t1.is_same_size(t2)); \ + ASSERT(t1.allclose(t2, atol, rtol)); diff --git a/aten/src/ATen/test/test_install/CMakeLists.txt b/aten/src/ATen/test/test_install/CMakeLists.txt new file mode 100644 index 0000000..dc904b4 --- /dev/null +++ b/aten/src/ATen/test/test_install/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.0) +find_package(ATen REQUIRED) +include_directories(${ATEN_INCLUDE_DIR}) + +# C++11 +set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") +add_executable(main main.cpp) +target_link_libraries(main ${ATEN_LIBRARIES}) diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp new file mode 100644 index 0000000..adeae38 --- /dev/null +++ b/aten/src/ATen/test/test_install/main.cpp @@ -0,0 +1,5 @@ +#include "ATen/ATen.h" + +int main() { + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; +} diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp new file mode 100644 index 0000000..5dbd967 --- /dev/null +++ b/aten/src/ATen/test/test_parallel.cpp @@ -0,0 +1,28 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/DLConvertor.h" + +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "parallel", "[cpu]" ) { + + manual_seed(123, at::Backend::CPU); + set_num_threads(1); + + Tensor a = rand({1,3}); + a[0][0] = 1; + a[0][1] = 0; + a[0][2] = 0; + Tensor as = rand({3}); + as[0] = 1; + as[1] = 0; + as[2] = 0; + REQUIRE(a.sum(0).equal(as)); +} diff --git a/aten/src/ATen/test/test_seed.h b/aten/src/ATen/test/test_seed.h new file mode 100644 index 0000000..16f9ecb --- /dev/null +++ b/aten/src/ATen/test/test_seed.h @@ -0,0 +1,13 @@ +#pragma once + +#include "ATen/ATen.h" + +void manual_seed(uint64_t seed, at::Backend backend) { + if (backend == at::Backend::CPU) { + at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::Backend::CPU); + cpu_gen.manualSeed(seed); + } else if (backend == at::Backend::CUDA && at::hasCUDA()) { + at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::Backend::CUDA); + cuda_gen.manualSeed(seed); + } +} diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp new file mode 100644 index 0000000..d88923d --- /dev/null +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -0,0 +1,53 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/UndefinedTensor.h" +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "undefined tensor test", "[]" ) { + manual_seed(123, at::Backend::CPU); + + // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. 
+ Tensor und; + Tensor ft = ones({1}, CPU(kFloat)); + + std::stringstream ss; + ss << und << std::endl; + REQUIRE(!und.defined()); + REQUIRE(std::string("UndefinedTensor") == und.toString()); + + REQUIRE_THROWS_WITH(und.strides(), Catch::Contains("strides")); + REQUIRE_THROWS_WITH(und.dim(), Catch::Contains("dim")); + REQUIRE_THROWS_WITH([]() {return Tensor();}() = Scalar(5), Catch::Contains("UndefinedType")); + REQUIRE_THROWS_WITH(und.unsafeGetTH(true), Catch::Contains("unsafeGetTH")); + REQUIRE_THROWS_WITH(und.add(und), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.add(ft), Catch::Contains("add")); + REQUIRE_THROWS_WITH(ft.add(und), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.add(5), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.mm(und), Catch::Contains("mm")); + + und.toType(und.type()); + REQUIRE_THROWS_WITH(und.toType(ft.type()), Catch::Contains("attempt to copy an undefined tensor")); + REQUIRE_THROWS_WITH(ft.toType(und.type()), Catch::Contains("UndefinedType")); + und.toType(ScalarType::Undefined); + REQUIRE_THROWS_WITH(und.toType(ScalarType::Float), Catch::Contains("toScalarType")); + REQUIRE_THROWS_WITH(ft.toType(ScalarType::Undefined), Catch::Contains("UndefinedType")); + + // copy_ + REQUIRE_THROWS_WITH(und.copy_(und), Catch::Contains("copy")); + REQUIRE_THROWS_WITH(und.copy_(ft), Catch::Contains("copy")); + REQUIRE_THROWS_WITH(ft.copy_(und), Catch::Contains("copy")); + + und.toBackend(Backend::Undefined); + REQUIRE_THROWS_WITH(und.toBackend(Backend::CPU), Catch::Contains("toBackend")); + REQUIRE_THROWS_WITH(ft.toBackend(Backend::Undefined), Catch::Contains("UndefinedType")); + + Tensor to_move = ones({1}, CPU(kFloat)); + Tensor m(std::move(to_move)); + REQUIRE(!to_move.defined()); + REQUIRE(to_move.get() == UndefinedTensor::singleton()); +} diff --git a/aten/src/ATen/test/verify_api_visibility.cpp b/aten/src/ATen/test/verify_api_visibility.cpp new file mode 100644 index 0000000..ed296ce --- /dev/null +++ b/aten/src/ATen/test/verify_api_visibility.cpp @@ -0,0 +1,15 @@ +#include + +#ifdef AT_CUDNN_ENABLED +#error "AT_CUDNN_ENABLED should not be visible in public headers" +#endif + +#ifdef AT_MKL_ENABLED +#error "AT_MKL_ENABLED should not be visible in public headers" +#endif + +#ifdef AT_MKLDNN_ENABLED +#error "AT_MKLDNN_ENABLED should not be visible in public headers" +#endif + +auto main() -> int {} diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp new file mode 100644 index 0000000..aab2ec5 --- /dev/null +++ b/aten/src/ATen/test/weakref_test.cpp @@ -0,0 +1,64 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" + +#include +#include +#include + +using at::Tensor; +using at::WeakTensor; + +TEST_CASE( "Weak pointer tests", "" ) { + SECTION("gets invalidated") { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + a.reset(); + REQUIRE_FALSE(b.lock().defined()); + } + + SECTION("can successfully lock") { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + auto c = b.lock(); + REQUIRE(c.defined()); + + a.reset(); + REQUIRE(b.lock().defined()); + c.reset(); + REQUIRE_FALSE(b.lock().defined()); + } + + SECTION("updates refcounts correctly") { + Tensor a = at::ones({2, 2}); + auto ai = a.unsafeGetTensorImpl(); + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 2); + } + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + 
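// Locking the weak reference yields a strong Tensor, so use_count rises to 2 while `locked` is in scope. +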
auto locked = b.lock(); + REQUIRE(locked.defined()); + REQUIRE(ai->use_count() == 2); + } + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 2); + a.reset(); + auto bi = b.unsafeGetTensorImpl(); + REQUIRE(bi->use_count() == 0); + REQUIRE(bi->weak_use_count() == 1); + } + } +} diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp new file mode 100644 index 0000000..599c103 --- /dev/null +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -0,0 +1,43 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "wrapdim test", "[]" ) { + manual_seed(123, at::Backend::CPU); + + Type & T = CPU(kFloat); + + SECTION( "simple case" ) { + auto a = randn({2, 3, 4, 5}, T); + REQUIRE(a.prod(-4).equal(a.prod(0))); + REQUIRE(a.prod(3).equal(a.prod(-1))); + } + + SECTION( "expression specification" ) { + auto a = randn({2, 3, 4, 5}, T); + REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + + // can unsqueeze scalar + auto b = randn(1, T); + b.get()->maybeScalar(true); + REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + } + + SECTION( "empty tensor" ) { + auto a = randn(0, T); + REQUIRE(a.prod(0).equal(at::ones({}, T))); + } + + SECTION( "scalar vs 1-dim, 1-size" ) { + auto a = randn(1, T); + REQUIRE(a.prod(0).equal(a.prod(-1))); + a.get()->maybeScalar(true); + REQUIRE(a.get()->isScalar()); + REQUIRE(a.prod(0).equal(a.prod(-1))); + } +} diff --git a/aten/src/README.md b/aten/src/README.md new file mode 100644 index 0000000..a641ea1 --- /dev/null +++ b/aten/src/README.md @@ -0,0 +1,144 @@ +This directory contains the low-level tensor libraries for PyTorch, +as well as the new ATen C++ bindings. + +The low-level libraries trace their lineage from the original Torch. There are +multiple variants of the library, summarized here: + +* TH = TorcH +* THC = TorcH Cuda +* THCS = TorcH Cuda Sparse (now defunct) +* THCUNN = TorcH CUda Neural Network (see cunn) +* THD = TorcH Distributed +* THNN = TorcH Neural Network +* THS = TorcH Sparse (now defunct) + +(You'll also see these abbreviations show up in symbol names.) + +## Reference counting + +PyTorch employs reference counting in order to permit tensors to provide +differing views on a common underlying storage. For example, when you call +view() on a Tensor, a new THTensor is allocated with differing dimensions, +but it shares the same THStorage with the original tensor. + +Unfortunately, this means we are in the business of manually tracking reference +counts inside our C library code. Fortunately, for most of our library code implementing +tensor operations, there is only one rule you have to remember: + +> **Golden Rule of Reference Counting:** You must either FREE or RETURN +> a pointer which was returned by a function whose name begins with +> `new` or which you called `retain` on. +> If you return this pointer, your function name must begin with `new`. + +In a long function, there may be many invocations of functions with `new` in +their name. Your responsibility is to go through each of them and ensure +that there is a matching `free` for it for EACH exit point of the function. + +### Examples + +Suppose you want to get a reference to the indices of a sparse tensor. This +function is called `newIndices`. The `new` means you MUST free it when you're +done (usually at the end of your function.) 
(It's worth noting that +`newIndices` doesn't actually allocate a fresh indices tensor; it just gives +you a pointer to the existing one.) DO NOT directly access the member +variables of the struct. + +``` +THIndexTensor *indices = THSTensor_(newIndices)(state, sparse); +// ... do some stuff ... +THIndexTensor_(free)(state, indices); +``` + +Let's take a look at the implementation of `newIndices`. This doesn't free the +return result of `newNarrow`, but returns it. This justifies the `new` in its +name. + +``` +THIndexTensor *THSTensor_(newIndices)(const THSTensor *self) { + // ... + return THIndexTensor_(newNarrow)(self->indices, 1, 0, self->nnz); +} +``` + +Passing an object to another function does NOT absolve you of responsibility +of freeing it. If that function holds on to a pointer to the object, it +will `retain` it itself. + +``` + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL); + THLongStorage_free(inferred_size); +``` + +Sometimes, you have a tensor in hand which you'd like to use directly, but +under some conditions you have to call, e.g., `newContiguous`, to get +it into the correct form: + +``` + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + ... + THTensor_(free)(kernel); +``` + +In this case, we have (redundantly) called `retain` on `k_`, so that we can +unconditionally free `kernel` at the end of the function; intuitively, you +want it to be possible to replace the conditional expression with an equivalent +function call, e.g., `kernel = THTensor_(newContiguous2D)(k_)`. + +### Tips + +* If you have an early exit in a function (via a `return`), don't forget to + `free` any pointers which you allocated up to this point. If at all possible, + move early exits prior to these allocations, so that you don't have to clean up. + +* Very occasionally, you may be able to implement an algorithm more efficiently + if you "destroy" its input. This is a `move`; after moving an object away, + you must NOT `free` it. This is the one exception to the rule, and at the + moment there is only one instance of `move` in the code base. + +* We use `THError` to signal error cases, and fortunately, + you do NOT need to make sure you've freed everything before calling `THError`, + because by default, it aborts the entire process. However, it's good style + to call `THError` before performing any allocations, since in some cases we + sketchily throw a C++ exception and try to recover (in particular, the test + suite does this.) + +## The C interface + +Historically, the Torch libraries were implemented in C. Since then, we have slowly +started rewriting bits and pieces of Torch in C++ (usually because there is some +C++ feature which would be really helpful for writing something.) However, +Torch has *always been*, and *will always be* a library that provides a C ABI +interface, even if, at some point in the future, its internal implementation +is entirely done in a C++ library that heavily uses C++ idioms. (At the moment, +all of the source files are C++, but they are mostly C code that happens to be +compiled as C++). + +In order to achieve this, the `TH_API` macro (called `THC_API` in `THC`) plays +a crucial role: it declares a function as having C-linkage, which means that the +C++ compiler doesn't mangle its name and a C client can link against it.
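+For illustration only, an export macro of this kind is conventionally built along the following lines; the exact definition used in the tree (see `THGeneral.h`) differs and additionally handles per-platform symbol visibility, and the helper names below (`TH_EXTERNC`, `TH_EXPORTS`) are placeholders for this sketch: + +``` +#ifdef __cplusplus +# define TH_EXTERNC extern "C" +#else +# define TH_EXTERNC extern +#endif + +#ifdef _WIN32 +# ifdef TH_EXPORTS +#  define TH_API TH_EXTERNC __declspec(dllexport) +# else +#  define TH_API TH_EXTERNC __declspec(dllimport) +# endif +#else +# define TH_API TH_EXTERNC +#endif +``` + +The essential piece is the `extern "C"`: it prevents C++ name mangling, so a plain C (or any FFI) client can resolve the symbol by its declared name.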
+ +As a developer, here is what you need to know: + +1. If you add a function to the public API of Torch, you *must* mark it with + `TH_API` or `THC_API` (depending if you are in CPU or CUDA land). + This will ensure it is built with C-linkage (and on Windows, it + will also ensure that the symbol is exported from the DLL; otherwise it + won't be visible.) + +2. C++ features should ONLY be used in `.cpp` and `.hpp` files, and not in + `.h` files. If you need to use a C++ type in a header file, you should + define this in a separate, C++ only header `.hpp`, and declare it opaquely + in the `.h`. Search for `mutex` for an example of this principle being applied. + (This convention is OPPOSITE from the prevailing convention in PyTorch and + ATen, where C++ headers are defined in `.h` files.) + +Arguably, the "C-compatible" headers should live in a separate directory, +distinct from the C++ code. We think this might be a good thing to do +eventually, and would make the code structure more clear, but we have not +done it at the moment. diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt new file mode 100644 index 0000000..5d588df --- /dev/null +++ b/aten/src/TH/CMakeLists.txt @@ -0,0 +1,131 @@ +set(extra_src) + +# IF ANY SIMD FOUND +IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve.cpp) +ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + +# IF SSE4 FOUND +IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_sse.cpp) +ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) + +# IF AVX FOUND +IF(C_AVX_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX.cpp) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_avx.cpp) +ENDIF(C_AVX_FOUND) + +IF(C_AVX2_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX2.cpp) +ENDIF(C_AVX2_FOUND) + +SET(hdr + THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THLapack.h THLogAdd.h THRandom.h THVector.h ) + +set(ATen_TH_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THLapack.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THLogAdd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THRandom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THDiskFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THMemoryFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THVector.cpp + ${extra_src} + ) +# Remember that PARENT_SCOPE variables are not in the current scope +set(ATen_TH_SRCS ${ATen_TH_SRCS} PARENT_SCOPE) +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} ${ATen_TH_SRCS} PARENT_SCOPE) +###################################################### + + +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +PARENT_SCOPE) + +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +PARENT_SCOPE) + +CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h") + + +INSTALL(FILES + TH.h + THAllocator.h + THMath.h + THBlas.h + THDiskFile.h + THFile.h + THFilePrivate.h + ${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h + THGenerateAllTypes.h + THGenerateDoubleType.h + 
THGenerateFloatType.h + THGenerateHalfType.h + THGenerateLongType.h + THGenerateIntType.h + THGenerateShortType.h + THGenerateCharType.h + THGenerateByteType.h + THGenerateFloatTypes.h + THGenerateIntTypes.h + THLapack.h + THLogAdd.h + THMemoryFile.h + THRandom.h + THSize.h + THStorage.h + THTensor.h + THTensorApply.h + THTensorDimApply.h + THVector.h + THHalf.h + THTensor.hpp + THStorage.hpp + THGenerator.hpp + THTypeConversion.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") + +INSTALL(FILES + vector/AVX.h + vector/AVX2.h + ../ATen/native/cpu/avx_mathfun.h + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/vector") + +INSTALL(FILES + generic/THBlas.cpp + generic/THBlas.h + generic/THLapack.cpp + generic/THLapack.h + generic/THStorage.cpp + generic/THStorage.h + generic/THStorageCopy.cpp + generic/THStorageCopy.h + generic/THTensor.cpp + generic/THTensor.h + generic/THTensorConv.cpp + generic/THTensorConv.h + generic/THTensorCopy.cpp + generic/THTensorCopy.h + generic/THTensorLapack.cpp + generic/THTensorLapack.h + generic/THTensorMath.cpp + generic/THTensorMath.h + generic/THTensorRandom.cpp + generic/THTensorRandom.h + generic/THVectorDispatch.cpp + generic/THVector.h + # See Note [TH abstraction violation] + generic/THTensorFastGetSet.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/generic") diff --git a/aten/src/TH/README.md b/aten/src/TH/README.md new file mode 100644 index 0000000..4ac26c1 --- /dev/null +++ b/aten/src/TH/README.md @@ -0,0 +1,11 @@ +Environment variables control the disabling of certain explicit SIMD optimizations. + +``` +x64 options: +TH_NO_AVX2=1 # disable AVX2 codepaths +TH_NO_AVX=1 # disable AVX codepaths +TH_NO_SSE=1 # disable SSE codepaths + +ppc64le options: +TH_NO_VSX=1 # disable VSX codepaths +``` diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h new file mode 100644 index 0000000..08bdde8 --- /dev/null +++ b/aten/src/TH/TH.h @@ -0,0 +1,24 @@ +#ifndef TH_INC +#define TH_INC + +#include "THGeneral.h" + +#include "THBlas.h" +#ifdef USE_LAPACK +#include "THLapack.h" +#endif + +#include "THVector.h" +#include "THLogAdd.h" +#include "THRandom.h" +#include "THSize.h" +#include "THStorage.h" +#include "THTensor.h" +#include "THTensorApply.h" +#include "THTensorDimApply.h" + +#include "THFile.h" +#include "THDiskFile.h" +#include "THMemoryFile.h" + +#endif diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp new file mode 100644 index 0000000..9dccbb3 --- /dev/null +++ b/aten/src/TH/THAllocator.cpp @@ -0,0 +1,563 @@ +#include "THAllocator.h" + +/* stuff for mapped files */ +#ifdef _WIN32 +#include +#endif + +#include +#if ATOMIC_INT_LOCK_FREE == 2 +#define TH_ATOMIC_IPC_REFCOUNT 1 +#endif + +#if HAVE_MMAP +#include +#include +#include +#include +#include +#endif +/* end of stuff for mapped files */ + +struct THDefaultAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + auto* ptr = THAlloc(size); + return {ptr, ptr, &THFree, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THFree; + } +}; + +static THDefaultAllocator th_default_allocator; +at::Allocator* getTHDefaultAllocator() { + return &th_default_allocator; +} + +#if defined(_WIN32) || defined(HAVE_MMAP) + +#define TH_ALLOC_ALIGNMENT 64 + +typedef struct { + std::atomic refcount; +} THMapInfo; + +const char * unknown_filename = "filename not specified"; +#ifdef _WIN32 +const char * unknown_eventname = "eventname not specified"; +#endif + +THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, 
size_t size) + : filename_(filename ? filename : unknown_filename) + , flags_(0) // to be filled later + , size_(0) // to be filled later +#ifdef _WIN32 + , handle_(INVALID_HANDLE_VALUE) // to be filled later + , event_(INVALID_HANDLE_VALUE) // to be filled later + , eventname_(filename ? std::string(filename) + "_event" : unknown_eventname) +#else + , fd_(fd) +#endif + , base_ptr_(nullptr) +{ + + if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE; + } + if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0) { + AT_ERROR("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file in shared mode"); + } +#ifdef _WIN32 + if (fd != -1) { + AT_ERROR("THMapAllocator_newWithFd is unsupported on Windows"); + } +#endif + flags_ = flags; + + // OK, now do the allocation + + if (size == 0) { + return; + } + +#ifdef _WIN32 + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { + // Shadowing + const char *filename; + const char *eventname; + LARGE_INTEGER hfilesz; + + if (filename_[0] == '/') { + filename = filename_.c_str() + 1; + eventname = eventname_.c_str() + 1; + } else { + filename = filename_.c_str(); + eventname = eventname_.c_str(); + } + + hfilesz.QuadPart = size; + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + event_ = CreateEvent(nullptr, FALSE, FALSE, eventname); + } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + event_ = OpenEvent(EVENT_ALL_ACCESS, FALSE, eventname); + } else { + AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); + } + + if (event_ == nullptr) { + AT_ERROR("Couldn't open shared event: <", eventname, ">, error code: <", GetLastError(), ">"); + } + + if (handle_ == nullptr) { + AT_ERROR("Couldn't open shared file mapping: <", filename, ">, error code: <", GetLastError(), ">"); + } + + size_ = size; + base_ptr_ = MapViewOfFile(handle_, FILE_MAP_ALL_ACCESS, 0, 0, size); + if (!base_ptr_) { + AT_ERROR("Couldn't map view of shared file <", filename, ">, error code: <", GetLastError(), ">"); + } + } else { + + HANDLE hfile; + HANDLE hmfile; + LARGE_INTEGER hfilesz; + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + AT_ERROR("exclusive file mapping is not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + AT_ERROR("file mapping without creation is not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + AT_ERROR("TH_ALLOCATOR_MAPPED_KEEPFD not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_FROMFD) { + AT_ERROR("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows"); + } + + /* open file */ + /* FILE_FLAG_RANDOM_ACCESS ? 
*/ + if (flags_) { + hfile = CreateFileA(filename_.c_str(), GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); + if (hfile == INVALID_HANDLE_VALUE) { + AT_ERROR("could not open file <", filename_, "> in read-write mode; error code: <", GetLastError(), ">"); + } + } else { + hfile = CreateFileA(filename_.c_str(), GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (hfile == INVALID_HANDLE_VALUE) { + AT_ERROR("could not open file <", filename_, "> in read-only mode; error code: <", GetLastError(), ">"); + } + } + + if (GetFileSizeEx(hfile, &hfilesz) == 0) { + AT_ERROR("could not get file size: <", filename_, ">; error code: <", GetLastError(), ">"); + } + + if (size > 0) { + if (size > hfilesz.QuadPart) { + if (flags_) { + hfilesz.QuadPart = size; + if (SetFilePointerEx(hfile, hfilesz, NULL, FILE_BEGIN) == 0) { + CloseHandle(hfile); + AT_ERROR("unable to stretch file <", filename_, "> to the right size; error code: <", GetLastError(), ">", filename_); + } + if (SetEndOfFile(hfile) == 0) { + CloseHandle(hfile); + AT_ERROR("unable to write to file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } else { + CloseHandle(hfile); + AT_ERROR("file <", filename_, "> size is smaller than the required mapping size <", size, ">; error code: <", GetLastError(), ">"); + } + } + } else { + size = hfilesz.QuadPart; + } + + size_ = size; /* if we are here, it must be the right size */ + + hfilesz.QuadPart = size_; + + /* get map handle */ + if (flags_) { + if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } else { + if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_WRITECOPY, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } + + /* map the stuff */ + if(flags_) { + base_ptr_ = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0); + } else { + base_ptr_ = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0); + } + + CloseHandle(hfile); + CloseHandle(hmfile); + } +#else /* _WIN32 */ + { + /* open file */ + int fd; + int flags; // shadow + struct stat file_stat; + + if (flags_ & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + flags = O_RDWR | O_CREAT; + } else { + flags = O_RDONLY; + } + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + flags |= O_EXCL; + } + if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + flags &= ~O_CREAT; + } + + if (!(flags_ & TH_ALLOCATOR_MAPPED_FROMFD)) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHARED) { + if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) { + AT_ERROR("unable to open file <", filename_, "> in read-write mode"); + } + } else if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_OPEN + if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) { + AT_ERROR("unable to open shared memory object <", filename_, "> in read-write mode"); + } +#else + AT_ERROR("unable to open file <", filename_, "> in sharedmem mode, shm_open unavailable on this platform"); +#endif + } else { + if ((fd = open(filename_.c_str(), O_RDONLY)) == -1) { + AT_ERROR("unable to open file <", filename_, "> in read-only mode"); + } + } + } else { + fd = fd_; + } + + if (fstat(fd, &file_stat) == -1) { + if (!(flags_ & TH_ALLOCATOR_MAPPED_FROMFD)) { + ::close(fd); + } + 
AT_ERROR("unable to stat the file <", filename_, ">"); + } + + if (size > 0) { + if (size > file_stat.st_size) { + if (flags_) { + if (ftruncate(fd, size) == -1) { + AT_ERROR("unable to resize file <", filename_, "> to the right size"); + } + if (fstat(fd, &file_stat) == -1 || file_stat.st_size < size) { + ::close(fd); + AT_ERROR("unable to stretch file <", filename_, "> to the right size"); + } +/* on macOS write returns with errno 45 (Opperation not supported) when used + * with a file descriptor obtained via shm_open + */ +#ifndef __APPLE__ + if ((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */ { + ::close(fd); + AT_ERROR("unable to write to file <", filename_, ">"); + } +#endif + } else { + ::close(fd); + AT_ERROR("file <", filename_, "> size is smaller than the required mapping size <", size, ">"); + } + } + } else { + size = file_stat.st_size; + } + + size_ = size; /* if we are here, it must be the right size */ + + /* map it */ + if (flags_ & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + base_ptr_ = mmap(nullptr, size_, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + } else { + base_ptr_ = mmap(nullptr, size_, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + if (base_ptr_ == MAP_FAILED) { + base_ptr_ = nullptr; /* let's be sure it is NULL */ + } + + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + fd_ = fd; + } else { + if (::close(fd) == -1) { + AT_ERROR("Error closing file <", filename_, ">"); + } + fd_ = -1; + } + + if (flags_ & TH_ALLOCATOR_MAPPED_UNLINK) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif + } else { + if (unlink(filename_.c_str()) == -1) + AT_ERROR("could not unlink file %s", filename_); + } + } + + if (base_ptr_ == MAP_FAILED) { + AT_ERROR("$ Torch: unable to mmap memory: you tried to mmap ", size_/1073741824, " GB."); + } + } +#endif +} + +THMapAllocator::THMapAllocator(const char *filename, int flags, size_t size) + : THMapAllocator(WITH_FD, filename, -1, flags, size) +{} + +#ifdef _WIN32 +typedef struct{ + HANDLE event; + HANDLE handle; + HANDLE wait; +} ReleaseContext; +static VOID CALLBACK WaitForReleaseHandle(PVOID lpParam, BOOLEAN TimerOrWaitFired) +{ + if (lpParam) { + ReleaseContext *ctx = (ReleaseContext *)lpParam; + + SetEvent(ctx->event); + CloseHandle(ctx->event); + CloseHandle(ctx->handle); + + UnregisterWait(ctx->wait); + + THFree(ctx); + } +} +#endif + +void THMapAllocator::close() { + if (closed_) { + return; + } + closed_ = true; + if (base_ptr_ == nullptr) { + return; + } +#ifdef _WIN32 + if ((flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) || (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM)) + CloseHandle(handle_); + if(UnmapViewOfFile(base_ptr_) == 0) + AT_ERROR("could not unmap the shared memory file"); +#else /* _WIN32 */ + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + if (::close(fd_) == -1) { + AT_ERROR("could not close file descriptor ", fd_); + } + } + + if (munmap(base_ptr_, size_)) { + AT_ERROR("could not unmap the shared memory file"); + } + + if (!(flags_ & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK))) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the 
shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif + } + } +#endif /* _WIN32 */ +} + +#else /* defined(_WIN32) || defined(HAVE_MMAP) */ + +THMapAllocator::THMapAllocator(const char *filename, int flags, size_t size) { + AT_ERROR("file mapping not supported on your system"); +} + +THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags) { + AT_ERROR("file mapping not supported on your system"); +} + +THMapAllocator::~THMapAllocator(THMapAllocator* ctx) {} + +#endif + +#if (defined(_WIN32) || defined(HAVE_MMAP)) && defined(TH_ATOMIC_IPC_REFCOUNT) + +THRefcountedMapAllocatorArgCheck::THRefcountedMapAllocatorArgCheck(int flags) { + if (flags & TH_ALLOCATOR_MAPPED_FROMFD) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_FROMFD flag"); + } + if (flags & TH_ALLOCATOR_MAPPED_KEEPFD) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_KEEPFD flag"); + } + if (flags & TH_ALLOCATOR_MAPPED_UNLINK) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_UNLINK flag"); + } + if (!(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + AT_ERROR("THRefcountedMapAllocator requires TH_ALLOCATOR_MAPPED_SHAREDMEM flag"); + } +} + +THRefcountedMapAllocator::THRefcountedMapAllocator(const char *filename, int flags, size_t size) + : THRefcountedMapAllocatorArgCheck(flags) + , THMapAllocator(filename, flags, size + TH_ALLOC_ALIGNMENT) { + + initializeAlloc(); +} +THRefcountedMapAllocator::THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) + : THRefcountedMapAllocatorArgCheck(flags) + , THMapAllocator(WITH_FD, filename, flags, fd, size + TH_ALLOC_ALIGNMENT) { + + initializeAlloc(); +} + +void THRefcountedMapAllocator::initializeAlloc() { + char *data = ((char*)base_ptr_) + TH_ALLOC_ALIGNMENT; + THMapInfo *map_info = (THMapInfo*)base_ptr_; + +#ifdef _WIN32 + ReleaseContext* r_ctx = (ReleaseContext *) THAlloc(sizeof(ReleaseContext)); + r_ctx->handle = handle_; + r_ctx->event = event_; + r_ctx->wait = NULL; + BOOL can_wait = RegisterWaitForSingleObject(&r_ctx->wait, event_, WaitForReleaseHandle, (PVOID)r_ctx, INFINITE, WT_EXECUTEONLYONCE); + if (!can_wait) { + AT_ERROR("Couldn't register wait on event, error code: <", GetLastError(), ">"); + } +#endif + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + new (&map_info->refcount) std::atomic(1); + } else { + map_info->refcount++; + } +} + +void THRefcountedMapAllocator::close() { + if (closed_) { + return; + } + closed_ = true; + + void* data = base_ptr_; + +#ifdef _WIN32 + THMapInfo *info = (THMapInfo*)data; + if (--info->refcount == 0) { + SetEvent(event_); + } + if(UnmapViewOfFile(data) == 0) { + AT_ERROR("could not unmap the shared memory file"); + } +#else /* _WIN32 */ + + THMapInfo *info = (THMapInfo*)(data); + if (--info->refcount == 0) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif /* HAVE_SHM_UNLINK */ + } + if (munmap(info, size_)) { + AT_ERROR("could not unmap the shared memory file ", filename_); + } +#endif /* _WIN32 */ +} + +void THRefcountedMapAllocator::incref() +{ + THMapInfo *map_info = static_cast(base_ptr_); + ++map_info->refcount; +} + +int THRefcountedMapAllocator::decref() +{ + THMapInfo *map_info = static_cast(base_ptr_); + return --map_info->refcount == 0; +} + +#else + + 
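+// Fallback stubs for platforms without file mapping support or lock-free atomic refcounting; +// constructing a THRefcountedMapAllocator here simply raises an error.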
+THRefcountedMapAllocatorArgCheck::THRefcountedMapAllocatorArgCheck(int flags) {} + +THRefcountedMapAllocator::THRefcountedMapAllocator(const char *filename, int flags, size_t size) { + AT_ERROR("refcounted file mapping not supported on your system"); +} + +THRefcountedMapAllocator::THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) { + AT_ERROR("refcounted file mapping not supported on your system"); +} + +void THRefcountedMapAllocator::initializeAlloc() {} +THRefcountedMapAllocator::~THRefcountedMapAllocator() {} + +#endif + +static void deleteTHMapAllocator(void* ptr) { + delete static_cast(ptr); +} + +static void deleteTHRefcountedMapAllocator(void* ptr) { + delete static_cast(ptr); +} + +THMapAllocator* THMapAllocator::fromDataPtr(const at::DataPtr& dptr) { + return dptr.cast_context(&deleteTHMapAllocator); +} + +THRefcountedMapAllocator* THRefcountedMapAllocator::fromDataPtr(const at::DataPtr& dptr) { + return dptr.cast_context(&deleteTHRefcountedMapAllocator); +} + +at::DataPtr THMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THMapAllocator(filename, flags, size); + if (actual_size_out) *actual_size_out = context->size(); + return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; +} + +at::DataPtr THMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THMapAllocator(WITH_FD, filename, fd, flags, size); + if (actual_size_out) *actual_size_out = context->size(); + return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; +} + +at::DataPtr THRefcountedMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THRefcountedMapAllocator(filename, flags, size); + if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; +} + +at::DataPtr THRefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THRefcountedMapAllocator(WITH_FD, filename, fd, flags, size); + if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; +} + +void* THRefcountedMapAllocator::data() const { + return static_cast(static_cast(base_ptr_) + TH_ALLOC_ALIGNMENT); +} diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h new file mode 100644 index 0000000..460f238 --- /dev/null +++ b/aten/src/TH/THAllocator.h @@ -0,0 +1,111 @@ +#pragma once + +#include "THGeneral.h" + +#ifdef __cplusplus +#include +#endif + +#define TH_ALLOCATOR_MAPPED_SHARED 1 +#define TH_ALLOCATOR_MAPPED_SHAREDMEM 2 +#define TH_ALLOCATOR_MAPPED_EXCLUSIVE 4 +#define TH_ALLOCATOR_MAPPED_NOCREATE 8 +#define TH_ALLOCATOR_MAPPED_KEEPFD 16 +#define TH_ALLOCATOR_MAPPED_FROMFD 32 +#define TH_ALLOCATOR_MAPPED_UNLINK 64 + +#ifdef __cplusplus +using THAllocator = at::Allocator; +#else +// struct at_THAllocator doesn't and will never exist, but we cannot name +// the actual struct because it's a namespaced C++ thing +typedef struct at_THAllocator THAllocator; +#endif + +/* default malloc/free allocator. malloc and realloc raise an error (using + * THError) on allocation failure. 
+ */ +TH_API THAllocator* getTHDefaultAllocator(void); + +#ifdef __cplusplus +// Sentinel value/type to help distinguish the file descriptor constructor from +// the non-file descriptor constructor +enum WithFd { WITH_FD }; + +class AT_API THMapAllocator { +public: + THMapAllocator(const char *filename, int flags, size_t size); + THMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); + THMapAllocator(const THMapAllocator&) = delete; + THMapAllocator& operator=(const THMapAllocator&) = delete; + THMapAllocator(THMapAllocator&&) = delete; + THMapAllocator& operator=(THMapAllocator&&) = delete; + + const char* filename() const { return filename_.c_str(); } + int fd() const { +#ifdef _WIN32 + AT_ERROR("THMapAllocator::fd() is unsupported on Windows"); +#else + return fd_; +#endif + } + ptrdiff_t size() const { return size_; } + // Return a pointer to the actual data for this allocator + // (in the case of the refcounted allocator, this is offset + // from the base pointer.) + virtual void* data() const { return base_ptr_; } + + static THMapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out); + static at::DataPtr makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out); + + // Closes the data. Helps us avoid destructor shenanigans + virtual void close(); + + // This is very dangerous. You have to redefine this destructor for each + // subclass + virtual ~THMapAllocator() { close(); } + +protected: + bool closed_ = false; + std::string filename_; + int flags_ = 0; + ptrdiff_t size_; /* mapped size */ +#ifdef _WIN32 + void* handle_; + void* event_; + std::string eventname_; +#else + int fd_ = -1; +#endif + void *base_ptr_ = nullptr; +}; + +// Base-from-member idiom +struct AT_API THRefcountedMapAllocatorArgCheck { + THRefcountedMapAllocatorArgCheck(int flags); +}; + +class AT_API THRefcountedMapAllocator : private THRefcountedMapAllocatorArgCheck, public THMapAllocator { +public: + THRefcountedMapAllocator(const char *filename, int flags, size_t size); + THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); + + static THRefcountedMapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out); + static at::DataPtr makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out); + + void* data() const override; + + void incref(); + int decref(); + void close() override; + + virtual ~THRefcountedMapAllocator() { close(); } + +protected: + void checkFlags(); + void initializeAlloc(); +}; + +#endif // __cplusplus diff --git a/aten/src/TH/THBlas.cpp b/aten/src/TH/THBlas.cpp new file mode 100644 index 0000000..7523c9e --- /dev/null +++ b/aten/src/TH/THBlas.cpp @@ -0,0 +1,4 @@ +#include "THBlas.h" + +#include "generic/THBlas.cpp" +#include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THBlas.h b/aten/src/TH/THBlas.h new file mode 100644 index 0000000..5fef0fe --- /dev/null +++ b/aten/src/TH/THBlas.h @@ -0,0 +1,11 @@ +#ifndef TH_BLAS_INC +#define TH_BLAS_INC + +#include "THGeneral.h" + +#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME) + +#include "generic/THBlas.h" +#include "THGenerateAllTypes.h" + +#endif diff --git a/aten/src/TH/THBlasUtils.h b/aten/src/TH/THBlasUtils.h new file mode 100644 index 0000000..8281047 --- /dev/null +++ b/aten/src/TH/THBlasUtils.h @@ -0,0 +1,32 @@ 
+#include +#include + +// This header file shouldn't be anything permanent; it's just a temporary +// dumping ground to help you get access to utilities in THBlas.h via templates, +// rather than by name directly. Someone should figure out a reasonable way to +// rewrite these in more idiomatic ATen and move it into ATen proper. + +template +inline void THBlas_axpy(int64_t n, T a, T *x, int64_t incx, T *y, int64_t incy); + +#define AXPY_SPECIALIZATION(ctype,name,_1) \ + template<> \ + inline void THBlas_axpy(int64_t n, ctype a, ctype *x, int64_t incx, \ + ctype *y, int64_t incy) { \ + TH ## name ## Blas_axpy(n, a, x, incx, y, incy); \ + } + +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(AXPY_SPECIALIZATION) + + +template +inline void THBlas_copy(int64_t n, T *x, int64_t incx, T *y, int64_t incy); + +#define COPY_SPECIALIZATION(ctype,name,_1) \ + template<> \ + inline void THBlas_copy(int64_t n, ctype *x, int64_t incx, \ + ctype *y, int64_t incy) { \ + TH ## name ## Blas_copy(n, x, incx, y, incy); \ + } + +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(COPY_SPECIALIZATION) diff --git a/aten/src/TH/THConfig.cmake.in b/aten/src/TH/THConfig.cmake.in new file mode 100644 index 0000000..306cd87 --- /dev/null +++ b/aten/src/TH/THConfig.cmake.in @@ -0,0 +1,9 @@ +# Find the TH includes and library +# +# TH_INCLUDE_DIR -- where to find the includes +# TH_LIBRARIES -- list of libraries to link against +# TH_FOUND -- set to 1 if found + +SET(TH_FOUND 1) +SET(TH_INCLUDE_DIR "@TH_INCLUDE_DIR@") +SET(TH_LIBRARIES "@TH_LIBRARIES@") diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp new file mode 100644 index 0000000..258ad2c --- /dev/null +++ b/aten/src/TH/THDiskFile.cpp @@ -0,0 +1,801 @@ +#include "THGeneral.h" +#include "THDiskFile.h" +#include "THFilePrivate.h" + +#ifndef _WIN32 +#include +#endif + +#include +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + +typedef struct THDiskFile__ +{ + THFile file; + + FILE *handle; + char *name; + int isNativeEncoding; + int longSize; + +} THDiskFile; + +static int THDiskFile_isOpened(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)self; + return (dfself->handle != NULL); +} + +const char *THDiskFile_name(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)self; + return dfself->name; +} + +/* workaround mac osx lion ***insane*** fread bug */ +#ifdef __APPLE__ +size_t fread__(void *ptr, size_t size, size_t nitems, FILE *stream) +{ + size_t nread = 0; + while(!feof(stream) && !ferror(stream) && (nread < nitems)) + nread += fread((char*)ptr+nread*size, size, THMin(2147483648/size, nitems-nread), stream); + return nread; +} +#else +#define fread__ fread +#endif + +#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM) \ + static ssize_t THDiskFile_read##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THDiskFile *dfself = (THDiskFile*)(self); \ + ssize_t nread = 0L; \ + \ + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); \ + \ + if(dfself->file.isBinary) \ + { \ + nread = fread__(data, sizeof(TYPE), n, dfself->handle); \ + if(!dfself->isNativeEncoding && (sizeof(TYPE) > 1) && (nread > 0)) \ + THDiskFile_reverseMemory(data, data, sizeof(TYPE), nread); \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ASCII_READ_ELEM; /* increment here result and break if wrong */ \ + } \ + if(dfself->file.isAutoSpacing && (n > 0)) \ + { \ + int c = fgetc(dfself->handle); \ + if( (c != '\n') && (c != EOF) ) \ + 
ungetc(c, dfself->handle); \ + } \ + } \ + \ + if(nread != n) \ + { \ + dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */ \ + if(!dfself->file.isQuiet) \ + THError("read error: read %d blocks instead of %d", nread, n); \ + } \ + \ + return nread; \ + } \ + \ + static ssize_t THDiskFile_write##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THDiskFile *dfself = (THDiskFile*)(self); \ + ssize_t nwrite = 0L; \ + \ + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); \ + \ + if(dfself->file.isBinary) \ + { \ + if(dfself->isNativeEncoding) \ + { \ + nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ + } \ + else \ + { \ + if(sizeof(TYPE) > 1) \ + { \ + char *buffer = static_cast(THAlloc(sizeof(TYPE)*n)); \ + THDiskFile_reverseMemory(buffer, data, sizeof(TYPE), n); \ + nwrite = fwrite(buffer, sizeof(TYPE), n, dfself->handle); \ + THFree(buffer); \ + } \ + else \ + nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ + } \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ASCII_WRITE_ELEM; \ + if( dfself->file.isAutoSpacing && (i < n-1) ) \ + fprintf(dfself->handle, " "); \ + } \ + if(dfself->file.isAutoSpacing && (n > 0)) \ + fprintf(dfself->handle, "\n"); \ + } \ + \ + if(nwrite != n) \ + { \ + dfself->file.hasError = 1; \ + if(!dfself->file.isQuiet) \ + THError("write error: wrote %d blocks instead of %d", nwrite, n); \ + } \ + \ + return nwrite; \ +} + +static int THDiskFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + else if(strlen(mode) == 2) + { + if(mode[0] == 'r' && mode[1] == 'w') + { + *isReadable = 1; + *isWritable = 1; + return 1; + } + } + return 0; +} + +static void THDiskFile_synchronize(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + fflush(dfself->handle); +} + +static void THDiskFile_seek(THFile *self, ssize_t position) +{ + THDiskFile *dfself = (THDiskFile*)(self); + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + THArgCheck(position <= INT64_MAX, 2, "position must be smaller than INT64_MAX"); + if(_fseeki64(dfself->handle, (int64_t)position, SEEK_SET) < 0) +#elif defined(_WIN32) + THArgCheck(position <= LONG_MAX, 2, "position must be smaller than LONG_MAX"); + if(fseek(dfself->handle, (int32_t)position, SEEK_SET) < 0) +#else + THArgCheck(position <= LLONG_MAX, 2, "position must be smaller than LLONG_MAX"); + if(fseeko(dfself->handle, (off_t)position, SEEK_SET) < 0) +#endif + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("unable to seek to position %zu", position); + } +} + +static void THDiskFile_seekEnd(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + if(_fseeki64(dfself->handle, 0, SEEK_END) < 0) +#elif defined(_WIN32) + if(fseek(dfself->handle, 0, SEEK_END) < 0) +#else + if(fseeko(dfself->handle, 0, SEEK_END) < 0) +#endif + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("unable to seek at end of file"); + } +} + +static ssize_t THDiskFile_position(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + 
THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + int64_t offset = _ftelli64(dfself->handle); +#elif defined(_WIN32) + int32_t offset = ftell(dfself->handle); +#else + off_t offset = ftello(dfself->handle); +#endif + if (offset > -1) + return (ssize_t)offset; + else if(!dfself->file.isQuiet) + THError("unable to obtain disk file offset (maybe a long overflow occurred)"); + + return 0; +} + +static void THDiskFile_close(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + fclose(dfself->handle); + dfself->handle = NULL; +} + +/* Little and Big Endian */ + +static void THDiskFile_reverseMemory(void *dst, const void *src, ssize_t blockSize, ssize_t numBlocks) +{ + if(blockSize > 1) + { + ssize_t halfBlockSize = blockSize/2; + char *charSrc = (char*)src; + char *charDst = (char*)dst; + ssize_t b, i; + for(b = 0; b < numBlocks; b++) + { + for(i = 0; i < halfBlockSize; i++) + { + char z = charSrc[i]; + charDst[i] = charSrc[blockSize-1-i]; + charDst[blockSize-1-i] = z; + } + charSrc += blockSize; + charDst += blockSize; + } + } +} + +int THDiskFile_isLittleEndianCPU(void) +{ + int x = 7; + char *ptr = (char *)&x; + + if(ptr[0] == 0) + return 0; + else + return 1; +} + +int THDiskFile_isBigEndianCPU(void) +{ + return(!THDiskFile_isLittleEndianCPU()); +} + +void THDiskFile_nativeEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = 1; +} + +void THDiskFile_littleEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = THDiskFile_isLittleEndianCPU(); +} + +void THDiskFile_bigEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = !THDiskFile_isLittleEndianCPU(); +} + +/* End of Little and Big Endian Stuff */ + +void THDiskFile_longSize(THFile *self, int size) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); + dfself->longSize = size; +} + +void THDiskFile_noBuffer(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + if (setvbuf(dfself->handle, NULL, _IONBF, 0)) { + THError("error: cannot disable buffer"); + } +} + +static void THDiskFile_free(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + if(dfself->handle) + fclose(dfself->handle); + THFree(dfself->name); + THFree(dfself); +} + +/* READ_WRITE_METHODS(int, Bool, */ +/* int value = 0; int ret = fscanf(file->handle, "%d", &value); array[i] = (value ? 1 : 0); if(ret <= 0) break; else result++, */ +/* int value = (array[i] ? 
1 : 0); nElemWritten = fprintf(file->handle, "%d", value), */ +/* true) */ + +/* Note that we do a trick */ +READ_WRITE_METHODS(uint8_t, Byte, + nread = fread(data, 1, n, dfself->handle); break, + nwrite = fwrite(data, 1, n, dfself->handle); break) + +READ_WRITE_METHODS(int8_t, Char, + nread = fread(data, 1, n, dfself->handle); break, + nwrite = fwrite(data, 1, n, dfself->handle); break) + +READ_WRITE_METHODS(int16_t, Short, + int ret = fscanf(dfself->handle, "%hd", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%hd", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(int32_t, Int, + int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(float, Float, + int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(THHalf, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(double, Double, + int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++) + + +/* For Long we need to rewrite everything, because of the special management of longSize */ +static ssize_t THDiskFile_readLong(THFile *self, int64_t *data, ssize_t n) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nread = 0L; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); + + if(dfself->file.isBinary) + { + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) + { + nread = fread__(data, sizeof(int64_t), n, dfself->handle); + if(!dfself->isNativeEncoding && (sizeof(int64_t) > 1) && (nread > 0)) + THDiskFile_reverseMemory(data, data, sizeof(int64_t), nread); + } else if(dfself->longSize == 4) + { + nread = fread__(data, 4, n, dfself->handle); + if(!dfself->isNativeEncoding && (nread > 0)) + THDiskFile_reverseMemory(data, data, 4, nread); + ssize_t i; + for(i = nread; i > 0; i--) + data[i-1] = ((int *)data)[i-1]; + } + else /* if(dfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + int32_t *buffer = static_cast(THAlloc(8*n)); + nread = fread__(buffer, 8, n, dfself->handle); + ssize_t i; + for(i = nread; i > 0; i--) + data[i-1] = buffer[2*(i-1) + big_endian]; + THFree(buffer); + if(!dfself->isNativeEncoding && (nread > 0)) + THDiskFile_reverseMemory(data, data, 4, nread); + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + int ret = fscanf(dfself->handle, "%" PRId64, &data[i]); if(ret <= 0) break; else nread++; + } + if(dfself->file.isAutoSpacing && (n > 0)) + { + int c = fgetc(dfself->handle); + if( (c != '\n') && (c != EOF) ) + ungetc(c, dfself->handle); + } + } + + if(nread != n) + { + dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ + if(!dfself->file.isQuiet) + THError("read error: read %d blocks instead of %d", nread, n); + } + + return nread; +} + +static ssize_t THDiskFile_writeLong(THFile *self, int64_t *data, ssize_t n) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nwrite = 0L; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); + + if(dfself->file.isBinary) + { + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) + { + if(dfself->isNativeEncoding) + { + nwrite = fwrite(data, sizeof(int64_t), n, dfself->handle); + } + else + { + char *buffer = static_cast(THAlloc(sizeof(int64_t)*n)); + THDiskFile_reverseMemory(buffer, data, sizeof(int64_t), n); + nwrite = fwrite(buffer, sizeof(int64_t), n, dfself->handle); + THFree(buffer); + } + } else if(dfself->longSize == 4) + { + int32_t *buffer = static_cast(THAlloc(4*n)); + ssize_t i; + for(i = 0; i < n; i++) + buffer[i] = (int32_t) data[i]; + if(!dfself->isNativeEncoding) + THDiskFile_reverseMemory(buffer, buffer, 4, n); + nwrite = fwrite(buffer, 4, n, dfself->handle); + THFree(buffer); + } + else /* if(dfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + int32_t *buffer = static_cast(THAlloc(8*n)); + ssize_t i; + for(i = 0; i < n; i++) + { + buffer[2*i + !big_endian] = 0; + buffer[2*i + big_endian] = (int32_t) data[i]; + } + if(!dfself->isNativeEncoding) + THDiskFile_reverseMemory(buffer, buffer, 8, n); + nwrite = fwrite(buffer, 8, n, dfself->handle); + THFree(buffer); + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + int ret = fprintf(dfself->handle, "%" PRId64, data[i]); if(ret <= 0) break; else nwrite++; + if( dfself->file.isAutoSpacing && (i < n-1) ) + fprintf(dfself->handle, " "); + } + if(dfself->file.isAutoSpacing && (n > 0)) + fprintf(dfself->handle, "\n"); + } + + if(nwrite != n) + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("write error: wrote %d blocks instead of %d", nwrite, n); + } + + return nwrite; +} + +static ssize_t THDiskFile_readString(THFile *self, const char *format, char **str_) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); + THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); + +/* note: the string won't survive long, as it is copied into lua */ +/* so 1024 is not that big... */ +#define TBRS_BSZ 1024L + + if(format[1] == 'a') + { + char *p = static_cast(THAlloc(TBRS_BSZ)); + ssize_t total = TBRS_BSZ; + ssize_t pos = 0; + + for (;;) + { + if(total-pos == 0) /* we need more space! */ + { + total += TBRS_BSZ; + p = static_cast(THRealloc(p, total)); + } + pos += fread(p+pos, 1, total-pos, dfself->handle); + if (pos < total) /* eof? */ + { + if(pos == 0) + { + THFree(p); + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + *str_ = p; + return pos; + } + } + } + else + { + char *p = static_cast(THAlloc(TBRS_BSZ)); + ssize_t total = TBRS_BSZ; + ssize_t pos = 0; + ssize_t size; + + for (;;) + { + if(total-pos <= 1) /* we can only write '\0' in there! */ + { + total += TBRS_BSZ; + p = static_cast(THRealloc(p, total)); + } + if (fgets(p+pos, (int) (total-pos), dfself->handle) == NULL) /* eof? 
*/ + { + if(pos == 0) + { + THFree(p); + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + *str_ = p; + return pos; + } + size = strlen(p+pos); + if (size == 0 || (p+pos)[size-1] != '\n') + { + pos += size; + } + else + { + pos += size-1; /* do not include `eol' */ + *str_ = p; + return pos; + } + } + } + + *str_ = NULL; + return 0; +} + + +static ssize_t THDiskFile_writeString(THFile *self, const char *str, ssize_t size) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nwrite; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); + + nwrite = fwrite(str, 1, size, dfself->handle); + if(nwrite != size) + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("write error: wrote %zu blocks instead of %zu", nwrite, size); + } + + return nwrite; +} + +THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet) +{ + static struct THFileVTable vtable = { + THDiskFile_isOpened, + + THDiskFile_readByte, + THDiskFile_readChar, + THDiskFile_readShort, + THDiskFile_readInt, + THDiskFile_readLong, + THDiskFile_readFloat, + THDiskFile_readDouble, + THDiskFile_readHalf, + THDiskFile_readString, + + THDiskFile_writeByte, + THDiskFile_writeChar, + THDiskFile_writeShort, + THDiskFile_writeInt, + THDiskFile_writeLong, + THDiskFile_writeFloat, + THDiskFile_writeDouble, + THDiskFile_writeHalf, + THDiskFile_writeString, + + THDiskFile_synchronize, + THDiskFile_seek, + THDiskFile_seekEnd, + THDiskFile_position, + THDiskFile_close, + THDiskFile_free + }; + + int isReadable; + int isWritable; + FILE *handle; + THDiskFile *self; + + THArgCheck(THDiskFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + + if( isReadable && isWritable ) + { + handle = fopen(name, "r+b"); + if(!handle) + { + handle = fopen(name, "wb"); + if(handle) + { + fclose(handle); + handle = fopen(name, "r+b"); + } + } + } + else + handle = fopen(name, (isReadable ? "rb" : "wb")); + + if(!handle) + { + if(isQuiet) + return 0; + else + THError("cannot open <%s> in mode %c%c", name, (isReadable ? 'r' : ' '), (isWritable ? 
'w' : ' ')); + } + + self = static_cast(THAlloc(sizeof(THDiskFile))); + + self->handle = handle; + self->name = static_cast(THAlloc(strlen(name)+1)); + strcpy(self->name, name); + self->isNativeEncoding = 1; + self->longSize = 0; + + self->file.vtable = &vtable; + self->file.isQuiet = isQuiet; + self->file.isReadable = isReadable; + self->file.isWritable = isWritable; + self->file.isBinary = 0; + self->file.isAutoSpacing = 1; + self->file.hasError = 0; + + return (THFile*)self; +} + +/* PipeFile */ + +static int THPipeFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + return 0; +} + +static void THPipeFile_free(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + if(dfself->handle) + pclose(dfself->handle); + THFree(dfself->name); + THFree(dfself); +} + +THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet) +{ + static struct THFileVTable vtable = { + THDiskFile_isOpened, + + THDiskFile_readByte, + THDiskFile_readChar, + THDiskFile_readShort, + THDiskFile_readInt, + THDiskFile_readLong, + THDiskFile_readFloat, + THDiskFile_readDouble, + THDiskFile_readHalf, + THDiskFile_readString, + + THDiskFile_writeByte, + THDiskFile_writeChar, + THDiskFile_writeShort, + THDiskFile_writeInt, + THDiskFile_writeLong, + THDiskFile_writeFloat, + THDiskFile_writeDouble, + THDiskFile_writeHalf, + THDiskFile_writeString, + + THDiskFile_synchronize, + THDiskFile_seek, + THDiskFile_seekEnd, + THDiskFile_position, + THDiskFile_close, + THPipeFile_free + }; + + int isReadable; + int isWritable; + FILE *handle; + THDiskFile *self; + + THArgCheck(THPipeFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w'"); + +#ifdef _WIN32 + handle = _popen(name, (isReadable ? "rb" : "wb")); +#else + handle = popen(name, (isReadable ? "r" : "w")); +#endif + + if(!handle) + { + if(isQuiet) + return 0; + else + THError("cannot open <%s> in mode %c%c. This might be because eg the executable doesn't exist, but it could also be because you are out of memory.", name, (isReadable ? 'r' : ' '), (isWritable ? 
'w' : ' ')); + } + + self = static_cast(THAlloc(sizeof(THDiskFile))); + + self->handle = handle; + self->name = static_cast(THAlloc(strlen(name)+1)); + strcpy(self->name, name); + self->isNativeEncoding = 1; + self->longSize = 0; + + self->file.vtable = &vtable; + self->file.isQuiet = isQuiet; + self->file.isReadable = isReadable; + self->file.isWritable = isWritable; + self->file.isBinary = 0; + self->file.isAutoSpacing = 1; + self->file.hasError = 0; + + return (THFile*)self; +} diff --git a/aten/src/TH/THDiskFile.h b/aten/src/TH/THDiskFile.h new file mode 100644 index 0000000..bc5c001 --- /dev/null +++ b/aten/src/TH/THDiskFile.h @@ -0,0 +1,19 @@ +#ifndef TH_DISK_FILE_INC +#define TH_DISK_FILE_INC + +#include "THFile.h" + +TH_API THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet); +TH_API THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet); + +TH_API const char *THDiskFile_name(THFile *self); + +TH_API int THDiskFile_isLittleEndianCPU(void); +TH_API int THDiskFile_isBigEndianCPU(void); +TH_API void THDiskFile_nativeEndianEncoding(THFile *self); +TH_API void THDiskFile_littleEndianEncoding(THFile *self); +TH_API void THDiskFile_bigEndianEncoding(THFile *self); +TH_API void THDiskFile_longSize(THFile *self, int size); +TH_API void THDiskFile_noBuffer(THFile *self); + +#endif diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp new file mode 100644 index 0000000..f3e1741 --- /dev/null +++ b/aten/src/TH/THFile.cpp @@ -0,0 +1,158 @@ +#include "THFile.h" +#include "THStorage.hpp" +#include "THFilePrivate.h" + +#define IMPLEMENT_THFILE_RW(TYPEC, TYPE) \ + size_t THFile_read##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ + { \ + return (*self->vtable->read##TYPEC)(self, data, n); \ + } \ + \ + size_t THFile_write##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ + { \ + return (*self->vtable->write##TYPEC)(self, data, n); \ + } + +IMPLEMENT_THFILE_RW(Byte, uint8_t) +IMPLEMENT_THFILE_RW(Char, int8_t) +IMPLEMENT_THFILE_RW(Short, int16_t) +IMPLEMENT_THFILE_RW(Int, int32_t) +IMPLEMENT_THFILE_RW(Long, int64_t) +IMPLEMENT_THFILE_RW(Float, float) +IMPLEMENT_THFILE_RW(Double, double) +IMPLEMENT_THFILE_RW(Half, THHalf) + +size_t THFile_readStringRaw(THFile *self, const char *format, char **str_) +{ + return self->vtable->readString(self, format, str_); +} + +size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size) +{ + return self->vtable->writeString(self, str, size); +} + +void THFile_synchronize(THFile *self) +{ + self->vtable->synchronize(self); +} + +void THFile_seek(THFile *self, size_t position) +{ + self->vtable->seek(self, position); +} + +void THFile_seekEnd(THFile *self) +{ + self->vtable->seekEnd(self); +} + +size_t THFile_position(THFile *self) +{ + return self->vtable->position(self); +} + +void THFile_close(THFile *self) +{ + self->vtable->close(self); +} + +void THFile_free(THFile *self) +{ + self->vtable->free(self); +} + +int THFile_isOpened(THFile *self) +{ + return self->vtable->isOpened(self); +} + +#define IMPLEMENT_THFILE_FLAGS(FLAG) \ + int THFile_##FLAG(THFile *self) \ + { \ + return self->FLAG; \ + } + +IMPLEMENT_THFILE_FLAGS(isQuiet) +IMPLEMENT_THFILE_FLAGS(isReadable) +IMPLEMENT_THFILE_FLAGS(isWritable) +IMPLEMENT_THFILE_FLAGS(isBinary) +IMPLEMENT_THFILE_FLAGS(isAutoSpacing) +IMPLEMENT_THFILE_FLAGS(hasError) + +void THFile_binary(THFile *self) +{ + self->isBinary = 1; +} + +void THFile_ascii(THFile *self) +{ + self->isBinary = 0; +} + +void THFile_autoSpacing(THFile *self) +{ + self->isAutoSpacing = 1; +} + 
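+
+/* The setters in this block only flip flags on the THFile struct; the concrete
+   back ends (THDiskFile, THMemoryFile) consult them on every read/write call.
+   A minimal usage sketch (illustrative only -- the file name is made up and
+   all error handling is omitted):
+
+     THFile *f = THDiskFile_new("tensor.bin", "w", 0);
+     THFile_binary(f);              // raw machine representation, not ASCII
+     THFile_writeIntScalar(f, 42);
+     THFile_synchronize(f);
+     THFile_free(f);                // also closes the underlying handle
+*/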
+void THFile_noAutoSpacing(THFile *self) +{ + self->isAutoSpacing = 0; +} + +void THFile_quiet(THFile *self) +{ + self->isQuiet = 1; +} + +void THFile_pedantic(THFile *self) +{ + self->isQuiet = 0; +} + +void THFile_clearError(THFile *self) +{ + self->hasError = 0; +} + +#define IMPLEMENT_THFILE_SCALAR(TYPEC, TYPE) \ + TYPE THFile_read##TYPEC##Scalar(THFile *self) \ + { \ + TYPE scalar; \ + THFile_read##TYPEC##Raw(self, &scalar, 1); \ + return scalar; \ + } \ + \ + void THFile_write##TYPEC##Scalar(THFile *self, TYPE scalar) \ + { \ + THFile_write##TYPEC##Raw(self, &scalar, 1); \ + } + +IMPLEMENT_THFILE_SCALAR(Byte, uint8_t) +IMPLEMENT_THFILE_SCALAR(Char, int8_t) +IMPLEMENT_THFILE_SCALAR(Short, int16_t) +IMPLEMENT_THFILE_SCALAR(Int, int32_t) +IMPLEMENT_THFILE_SCALAR(Long, int64_t) +IMPLEMENT_THFILE_SCALAR(Float, float) +IMPLEMENT_THFILE_SCALAR(Double, double) +IMPLEMENT_THFILE_SCALAR(Half, THHalf) + +#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ + size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ + { \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size); \ + } \ + \ + size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ + { \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size); \ + } + +IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) +IMPLEMENT_THFILE_STORAGE(Char, int8_t) +IMPLEMENT_THFILE_STORAGE(Short, int16_t) +IMPLEMENT_THFILE_STORAGE(Int, int32_t) +IMPLEMENT_THFILE_STORAGE(Long, int64_t) +IMPLEMENT_THFILE_STORAGE(Float, float) +IMPLEMENT_THFILE_STORAGE(Double, double) +IMPLEMENT_THFILE_STORAGE(Half, THHalf) diff --git a/aten/src/TH/THFile.h b/aten/src/TH/THFile.h new file mode 100644 index 0000000..27041f5 --- /dev/null +++ b/aten/src/TH/THFile.h @@ -0,0 +1,91 @@ +#ifndef TH_FILE_INC +#define TH_FILE_INC + +#include "THStorage.h" + +typedef struct THFile__ THFile; + +TH_API int THFile_isOpened(THFile *self); +TH_API int THFile_isQuiet(THFile *self); +TH_API int THFile_isReadable(THFile *self); +TH_API int THFile_isWritable(THFile *self); +TH_API int THFile_isBinary(THFile *self); +TH_API int THFile_isAutoSpacing(THFile *self); +TH_API int THFile_hasError(THFile *self); + +TH_API void THFile_binary(THFile *self); +TH_API void THFile_ascii(THFile *self); +TH_API void THFile_autoSpacing(THFile *self); +TH_API void THFile_noAutoSpacing(THFile *self); +TH_API void THFile_quiet(THFile *self); +TH_API void THFile_pedantic(THFile *self); +TH_API void THFile_clearError(THFile *self); + +/* scalar */ +TH_API uint8_t THFile_readByteScalar(THFile *self); +TH_API int8_t THFile_readCharScalar(THFile *self); +TH_API int16_t THFile_readShortScalar(THFile *self); +TH_API int32_t THFile_readIntScalar(THFile *self); +TH_API int64_t THFile_readLongScalar(THFile *self); +TH_API float THFile_readFloatScalar(THFile *self); +TH_API double THFile_readDoubleScalar(THFile *self); + +TH_API void THFile_writeByteScalar(THFile *self, uint8_t scalar); +TH_API void THFile_writeCharScalar(THFile *self, int8_t scalar); +TH_API void THFile_writeShortScalar(THFile *self, int16_t scalar); +TH_API void THFile_writeIntScalar(THFile *self, int32_t scalar); +TH_API void THFile_writeLongScalar(THFile *self, int64_t scalar); +TH_API void THFile_writeFloatScalar(THFile *self, float scalar); +TH_API void THFile_writeDoubleScalar(THFile *self, double scalar); + +/* storage */ +TH_API size_t THFile_readByte(THFile *self, THByteStorage *storage); +TH_API size_t THFile_readChar(THFile *self, THCharStorage *storage); +TH_API 
size_t THFile_readShort(THFile *self, THShortStorage *storage); +TH_API size_t THFile_readInt(THFile *self, THIntStorage *storage); +TH_API size_t THFile_readLong(THFile *self, THLongStorage *storage); +TH_API size_t THFile_readFloat(THFile *self, THFloatStorage *storage); +TH_API size_t THFile_readDouble(THFile *self, THDoubleStorage *storage); + +TH_API size_t THFile_writeByte(THFile *self, THByteStorage *storage); +TH_API size_t THFile_writeChar(THFile *self, THCharStorage *storage); +TH_API size_t THFile_writeShort(THFile *self, THShortStorage *storage); +TH_API size_t THFile_writeInt(THFile *self, THIntStorage *storage); +TH_API size_t THFile_writeLong(THFile *self, THLongStorage *storage); +TH_API size_t THFile_writeFloat(THFile *self, THFloatStorage *storage); +TH_API size_t THFile_writeDouble(THFile *self, THDoubleStorage *storage); + +/* raw */ +TH_API size_t THFile_readByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_readCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_readShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_readIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_readLongRaw(THFile *self, int64_t *data, size_t n); +TH_API size_t THFile_readFloatRaw(THFile *self, float *data, size_t n); +TH_API size_t THFile_readDoubleRaw(THFile *self, double *data, size_t n); +TH_API size_t THFile_readStringRaw(THFile *self, const char *format, char **str_); /* you must deallocate str_ */ + +TH_API size_t THFile_writeByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_writeCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_writeShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_writeIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_writeLongRaw(THFile *self, int64_t *data, size_t n); +TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n); +TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n); +TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size); + +TH_API THHalf THFile_readHalfScalar(THFile *self); +TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar); +TH_API size_t THFile_readHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size); +TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size); + +TH_API void THFile_synchronize(THFile *self); +TH_API void THFile_seek(THFile *self, size_t position); +TH_API void THFile_seekEnd(THFile *self); +TH_API size_t THFile_position(THFile *self); +TH_API void THFile_close(THFile *self); +TH_API void THFile_free(THFile *self); + +#endif diff --git a/aten/src/TH/THFilePrivate.h b/aten/src/TH/THFilePrivate.h new file mode 100644 index 0000000..93bbaa0 --- /dev/null +++ b/aten/src/TH/THFilePrivate.h @@ -0,0 +1,50 @@ +#include "THGeneral.h" + +#include "THHalf.h" + + +struct THFile__ +{ + struct THFileVTable *vtable; + + int isQuiet; + int isReadable; + int isWritable; + int isBinary; + int isAutoSpacing; + int hasError; +}; + +/* virtual table definition */ + +struct THFileVTable +{ + int (*isOpened)(THFile *self); + + ssize_t (*readByte)(THFile *self, uint8_t *data, ssize_t n); + ssize_t (*readChar)(THFile *self, int8_t *data, ssize_t n); + ssize_t (*readShort)(THFile *self, int16_t *data, ssize_t n); + ssize_t (*readInt)(THFile *self, int32_t *data, ssize_t n); + ssize_t 
(*readLong)(THFile *self, int64_t *data, ssize_t n); + ssize_t (*readFloat)(THFile *self, float *data, ssize_t n); + ssize_t (*readDouble)(THFile *self, double *data, ssize_t n); + ssize_t (*readHalf)(THFile *self, THHalf *data, ssize_t n); + ssize_t (*readString)(THFile *self, const char *format, char **str_); + + ssize_t (*writeByte)(THFile *self, uint8_t *data, ssize_t n); + ssize_t (*writeChar)(THFile *self, int8_t *data, ssize_t n); + ssize_t (*writeShort)(THFile *self, int16_t *data, ssize_t n); + ssize_t (*writeInt)(THFile *self, int32_t *data, ssize_t n); + ssize_t (*writeLong)(THFile *self, int64_t *data, ssize_t n); + ssize_t (*writeFloat)(THFile *self, float *data, ssize_t n); + ssize_t (*writeDouble)(THFile *self, double *data, ssize_t n); + ssize_t (*writeHalf)(THFile *self, THHalf *data, ssize_t n); + ssize_t (*writeString)(THFile *self, const char *str, ssize_t size); + + void (*synchronize)(THFile *self); + void (*seek)(THFile *self, ssize_t position); + void (*seekEnd)(THFile *self); + ssize_t (*position)(THFile *self); + void (*close)(THFile *self); + void (*free)(THFile *self); +}; diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp new file mode 100644 index 0000000..667d7fb --- /dev/null +++ b/aten/src/TH/THGeneral.cpp @@ -0,0 +1,328 @@ +#include "THGeneral.h" + +#ifdef _OPENMP +#include +#endif + +#ifndef TH_HAVE_THREAD +#define __thread +#elif _MSC_VER +#define __thread __declspec( thread ) +#endif + +#if (defined(__unix) || defined(_WIN32)) + #if defined(__FreeBSD__) + #include + #else + #include + #endif +#elif defined(__APPLE__) +#include +#endif + +#ifdef TH_BLAS_MKL +// this is the C prototype, while mkl_set_num_threads is the fortran prototype +TH_EXTERNC void MKL_Set_Num_Threads(int); +// this is the C prototype, while mkl_get_max_threads is the fortran prototype +TH_EXTERNC int MKL_Get_Max_Threads(void); +#endif + +/* Torch Error Handling */ +static void defaultErrorHandlerFunction(const char *msg, void *data) +{ + printf("$ Error: %s\n", msg); + exit(-1); +} + +static THErrorHandlerFunction defaultErrorHandler = defaultErrorHandlerFunction; +static void *defaultErrorHandlerData; +static __thread THErrorHandlerFunction threadErrorHandler = NULL; +static __thread void *threadErrorHandlerData; + +void _THError(const char *file, const int line, const char *fmt, ...) +{ + char msg[2048]; + va_list args; + + /* vasprintf not standard */ + /* vsnprintf: how to handle if does not exists? */ + va_start(args, fmt); + int n = vsnprintf(msg, 2048, fmt, args); + va_end(args); + + if(n < 2048) { + snprintf(msg + n, 2048 - n, " at %s:%d", file, line); + } + + if (threadErrorHandler) + (*threadErrorHandler)(msg, threadErrorHandlerData); + else + (*defaultErrorHandler)(msg, defaultErrorHandlerData); + TH_UNREACHABLE; +} + +void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...) { + char msg[1024]; + va_list args; + va_start(args, fmt); + vsnprintf(msg, 1024, fmt, args); + va_end(args); + _THError(file, line, "Assertion `%s' failed. 
%s", exp, msg); +} + +void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data) +{ + threadErrorHandler = new_handler; + threadErrorHandlerData = data; +} + +void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data) +{ + if (new_handler) + defaultErrorHandler = new_handler; + else + defaultErrorHandler = defaultErrorHandlerFunction; + defaultErrorHandlerData = data; +} + +/* Torch Arg Checking Handling */ +static void defaultArgErrorHandlerFunction(int argNumber, const char *msg, void *data) +{ + if(msg) + printf("$ Invalid argument %d: %s\n", argNumber, msg); + else + printf("$ Invalid argument %d\n", argNumber); + exit(-1); +} + +static THArgErrorHandlerFunction defaultArgErrorHandler = defaultArgErrorHandlerFunction; +static void *defaultArgErrorHandlerData; +static __thread THArgErrorHandlerFunction threadArgErrorHandler = NULL; +static __thread void *threadArgErrorHandlerData; + +void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...) +{ + if(!condition) { + char msg[2048]; + va_list args; + + /* vasprintf not standard */ + /* vsnprintf: how to handle if does not exists? */ + va_start(args, fmt); + int n = vsnprintf(msg, 2048, fmt, args); + va_end(args); + + if(n < 2048) { + snprintf(msg + n, 2048 - n, " at %s:%d", file, line); + } + + if (threadArgErrorHandler) + (*threadArgErrorHandler)(argNumber, msg, threadArgErrorHandlerData); + else + (*defaultArgErrorHandler)(argNumber, msg, defaultArgErrorHandlerData); + TH_UNREACHABLE; + } +} + +void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) +{ + threadArgErrorHandler = new_handler; + threadArgErrorHandlerData = data; +} + +void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) +{ + if (new_handler) + defaultArgErrorHandler = new_handler; + else + defaultArgErrorHandler = defaultArgErrorHandlerFunction; + defaultArgErrorHandlerData = data; +} + +static __thread void (*torchGCFunction)(void *data) = NULL; +static __thread void *torchGCData; + +/* Optional hook for integrating with a garbage-collected frontend. + * + * If torch is running with a garbage-collected frontend (e.g. Lua), + * the GC isn't aware of TH-allocated memory so may not know when it + * needs to run. These hooks trigger the GC to run in two cases: + * + * (1) When a memory allocation (malloc, realloc, ...) fails + * (2) When the total TH-allocated memory hits a dynamically-adjusted + * soft maximum. + */ +void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data ) +{ + torchGCFunction = torchGCFunction_; + torchGCData = data; +} + +static void* THAllocInternal(ptrdiff_t size) +{ + void *ptr; + + if (size > 5120) + { +#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN)) + if (posix_memalign(&ptr, 64, size) != 0) + ptr = NULL; +/* +#elif defined(_WIN32) + ptr = _aligned_malloc(size, 64); +*/ +#else + ptr = malloc(size); +#endif + } + else + { + ptr = malloc(size); + } + + return ptr; +} + +void* THAlloc(ptrdiff_t size) +{ + void *ptr; + + if(size < 0) + THError("$ Torch: invalid memory size -- maybe an overflow?"); + + if(size == 0) + return NULL; + + ptr = THAllocInternal(size); + + if(!ptr && torchGCFunction) { + torchGCFunction(torchGCData); + ptr = THAllocInternal(size); + } + + if(!ptr) + THError("$ Torch: not enough memory: you tried to allocate %dGB. 
Buy new RAM!", size/1073741824); + + return ptr; +} + +void* THRealloc(void *ptr, ptrdiff_t size) +{ + if(!ptr) + return(THAlloc(size)); + + if(size == 0) + { + THFree(ptr); + return NULL; + } + + if(size < 0) + THError("$ Torch: invalid memory size -- maybe an overflow?"); + + void *newptr = realloc(ptr, size); + + if(!newptr && torchGCFunction) { + torchGCFunction(torchGCData); + newptr = realloc(ptr, size); + } + + if(!newptr) + THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824); + + return newptr; +} + +void THFree(void *ptr) +{ + free(ptr); +} + +double THLog10(const double x) +{ + return log10(x); +} + +double THLog1p(const double x) +{ +#if (defined(_MSC_VER) || defined(__MINGW32__)) + volatile double y = 1 + x; + return log(y) - ((y-1)-x)/y ; /* cancels errors with IEEE arithmetic */ +#else + return log1p(x); +#endif +} + +double THLog2(const double x) +{ + return log2(x); +} + +double THExpm1(const double x) +{ + return expm1(x); +} + +void THSetNumThreads(int num_threads) +{ +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif +#ifdef TH_BLAS_MKL + MKL_Set_Num_Threads(num_threads); +#endif + +} + +int THGetNumThreads(void) +{ +#ifdef _OPENMP + return omp_get_max_threads(); +#else + return 1; +#endif +} + +int THGetNumCores(void) +{ +#ifdef _OPENMP + return omp_get_num_procs(); +#else + return 1; +#endif +} + +TH_API void THInferNumThreads(void) +{ +#if defined(_OPENMP) && defined(TH_BLAS_MKL) + // If we are using MKL an OpenMP make sure the number of threads match. + // Otherwise, MKL and our OpenMP-enabled functions will keep changing the + // size of the OpenMP thread pool, resulting in worse performance (and memory + // leaks in GCC 5.4) + omp_set_num_threads(MKL_Get_Max_Threads()); +#endif +} + +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { + const int L = TH_DESC_BUFF_LEN; + THDescBuff buf; + char *str = buf.str; + int i, n = 0; + n += snprintf(str, L-n, "["); + + for (i = 0; i < ndim; i++) { + if (n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, size[i]); + if (i < ndim-1) { + n += snprintf(str+n, L-n, " x "); + } + } + + if (n < L - 2) { + snprintf(str+n, L-n, "]"); + } else { + snprintf(str+L-5, 5, "...]"); + } + + return buf; +} diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in new file mode 100644 index 0000000..103710b --- /dev/null +++ b/aten/src/TH/THGeneral.h.in @@ -0,0 +1,187 @@ +#ifndef TH_GENERAL_INC +#define TH_GENERAL_INC + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TH_BLAS_MKL +#include +#endif + +#cmakedefine USE_BLAS +#cmakedefine USE_LAPACK +#cmakedefine BLAS_F2C +#cmakedefine BLAS_USE_CBLAS_DOT + +#ifdef __cplusplus +# define TH_EXTERNC extern "C" +#else +# define TH_EXTERNC extern +#endif + +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define TH_API TH_EXTERNC __declspec(dllexport) +# define TH_CPP_API extern __declspec(dllexport) +# else +# define TH_API TH_EXTERNC __declspec(dllimport) +# define TH_CPP_API extern __declspec(dllimport) +# endif +#else +# define TH_API TH_EXTERNC +# define TH_CPP_API extern +#endif + +#ifdef _WIN32 +# define TH_NO_RETURN __declspec(noreturn) +# define TH_UNREACHABLE +#else +# define TH_NO_RETURN __attribute__((noreturn)) +# define TH_UNREACHABLE __builtin_unreachable(); +#endif + +#if defined(__GNUC__) && ((__GNUC__ > 2) || (__GNUC__ == 2 && 
__GNUC_MINOR__ > 4)) +# define TH_UNUSED __attribute__((unused)) +#else +# define TH_UNUSED +#endif + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +#define TH_INDEX_BASE 0 + +typedef void (*THErrorHandlerFunction)(const char *msg, void *data); +typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data); + +#define TH_DESC_BUFF_LEN 64 +typedef struct { + char str[TH_DESC_BUFF_LEN]; +} THDescBuff; + + +TH_API double THLog1p(const double x); +TH_API double THLog2(const double x); +TH_API double THExpm1(const double x); +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim); +TH_API TH_NO_RETURN void _THError(const char *file, const int line, const char *fmt, ...); +TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...); +TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data); +TH_API void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data); +TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...); +TH_API void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); +TH_API void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); +TH_API void* THAlloc(ptrdiff_t size); +TH_API void* THRealloc(void *ptr, ptrdiff_t size); +TH_API void THFree(void *ptr); +TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data ); +// this hook should only be called by custom allocator functions +TH_API void THHeapUpdate(ptrdiff_t size); +TH_API void THSetNumThreads(int num_threads); +TH_API int THGetNumThreads(void); +TH_API int THGetNumCores(void); +TH_API void THInferNumThreads(void); + +#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__) + +#define THCleanup(...) __VA_ARGS__ + +#define THArgCheck(...) \ +do { \ + _THArgCheck(__FILE__, __LINE__, __VA_ARGS__); \ +} while(0) + +#define THArgError(...) \ +do { \ + _THArgCheck(__FILE__, __LINE__, false, __VA_ARGS__); \ + TH_UNREACHABLE \ +} while(0) + +#define THArgCheckWithCleanup(condition, cleanup, ...) \ +do if (!(condition)) { \ + cleanup \ + _THArgCheck(__FILE__, __LINE__, 0, __VA_ARGS__); \ +} while(0) + +#define THAssert(exp) \ +do { \ + if (!(exp)) { \ + _THAssertionFailed(__FILE__, __LINE__, #exp, ""); \ + } \ +} while(0) + +#define THAssertMsg(exp, ...) \ +do { \ + if (!(exp)) { \ + _THAssertionFailed(__FILE__, __LINE__, #exp, __VA_ARGS__); \ + } \ +} while(0) + +#define TH_CONCAT_STRING_2(x,y) TH_CONCAT_STRING_2_EXPAND(x,y) +#define TH_CONCAT_STRING_2_EXPAND(x,y) #x #y + +#define TH_CONCAT_STRING_3(x,y,z) TH_CONCAT_STRING_3_EXPAND(x,y,z) +#define TH_CONCAT_STRING_3_EXPAND(x,y,z) #x #y #z + +#define TH_CONCAT_STRING_4(x,y,z,w) TH_CONCAT_STRING_4_EXPAND(x,y,z,w) +#define TH_CONCAT_STRING_4_EXPAND(x,y,z,w) #x #y #z #w + +#define TH_CONCAT_2(x,y) TH_CONCAT_2_EXPAND(x,y) +#define TH_CONCAT_2_EXPAND(x,y) x ## y + +#define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) +#define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z + +#define TH_CONCAT_4_EXPAND(x,y,z,w) x ## y ## z ## w +#define TH_CONCAT_4(x,y,z,w) TH_CONCAT_4_EXPAND(x,y,z,w) + +#define THMin(X, Y) ((X) < (Y) ? (X) : (Y)) +#define THMax(X, Y) ((X) > (Y) ? 
(X) : (Y)) + +#if (defined(_MSC_VER) || defined(__MINGW32__)) +#if defined(_MSC_VER) +__inline double log1p(double x) { return THLog1p(x); } +#else +inline double log1p(double x) { return THLog1p(x); } +#endif + +#if defined(_MSC_VER) +__inline double log2(double x) { return THLog2(x); } +#else +inline double log2(double x) { return THLog2(x); } +#endif + +#if defined(_MSC_VER) +__inline double expm1(double x) { return THExpm1(x); } +#else +inline double expm1(double x) { return THExpm1(x); } +#endif + +#define snprintf _snprintf +#define popen _popen +#define pclose _pclose +#include +typedef SSIZE_T ssize_t; +#endif + +#endif diff --git a/aten/src/TH/THGenerateAllTypes.h b/aten/src/TH/THGenerateAllTypes.h new file mode 100644 index 0000000..5b9508d --- /dev/null +++ b/aten/src/TH/THGenerateAllTypes.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THAllLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateFloatTypes.h" +#include "THGenerateIntTypes.h" + +#ifdef THAllLocalGenerateManyTypes +#undef THAllLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateByteType.h b/aten/src/TH/THGenerateByteType.h new file mode 100644 index 0000000..0ec234d --- /dev/null +++ b/aten/src/TH/THGenerateByteType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateByteType.h" +#endif + +#define real uint8_t +#define ureal uint8_t +#define accreal int64_t +#define Real Byte +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define THInf UCHAR_MAX +#define TH_REAL_IS_BYTE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_BYTE +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateCharType.h b/aten/src/TH/THGenerateCharType.h new file mode 100644 index 0000000..9c172f1 --- /dev/null +++ b/aten/src/TH/THGenerateCharType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateCharType.h" +#endif + +#define real int8_t +#define ureal uint8_t +#define accreal int64_t +#define Real Char +#define THInf SCHAR_MAX +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define TH_REAL_IS_CHAR +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_CHAR +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateDoubleType.h b/aten/src/TH/THGenerateDoubleType.h new file mode 100644 index 0000000..fffee60 --- /dev/null +++ b/aten/src/TH/THGenerateDoubleType.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateDoubleType.h" +#endif + +#define real double +#define accreal double +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Double +#define THInf DBL_MAX +#define TH_REAL_IS_DOUBLE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef accreal +#undef 
real +#undef Real +#undef THInf +#undef TH_REAL_IS_DOUBLE +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateFloatType.h b/aten/src/TH/THGenerateFloatType.h new file mode 100644 index 0000000..a31b50c --- /dev/null +++ b/aten/src/TH/THGenerateFloatType.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h" +#endif + +#define real float +#define accreal double +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Float +#define THInf FLT_MAX +#define TH_REAL_IS_FLOAT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef accreal +#undef real +#undef Real +#undef THInf +#undef TH_REAL_IS_FLOAT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateFloatTypes.h b/aten/src/TH/THGenerateFloatTypes.h new file mode 100644 index 0000000..be5ea84 --- /dev/null +++ b/aten/src/TH/THGenerateFloatTypes.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THFloatLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateFloatType.h" +#include "THGenerateDoubleType.h" + +#ifdef THFloatLocalGenerateManyTypes +#undef THFloatLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h new file mode 100644 index 0000000..47ff1e8 --- /dev/null +++ b/aten/src/TH/THGenerateHalfType.h @@ -0,0 +1,25 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h" +#endif + +#include "THHalf.h" +#define real THHalf +#define accreal float +#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define Real Half +#define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) +#define TH_REAL_IS_HALF +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_HALF +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateIntType.h b/aten/src/TH/THGenerateIntType.h new file mode 100644 index 0000000..5135bc5 --- /dev/null +++ b/aten/src/TH/THGenerateIntType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateIntType.h" +#endif + +#define real int32_t +#define ureal uint32_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Int +#define THInf INT_MAX +#define TH_REAL_IS_INT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_INT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateIntTypes.h b/aten/src/TH/THGenerateIntTypes.h new file mode 100644 index 0000000..9931fb1 --- /dev/null +++ b/aten/src/TH/THGenerateIntTypes.h @@ -0,0 +1,20 @@ +#ifndef TH_GENERIC_FILE +#error "You 
must define TH_GENERIC_FILE before including THGenerateIntTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THIntLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateByteType.h" +#include "THGenerateCharType.h" +#include "THGenerateShortType.h" +#include "THGenerateIntType.h" +#include "THGenerateLongType.h" + +#ifdef THIntLocalGenerateManyTypes +#undef THIntLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateLongType.h b/aten/src/TH/THGenerateLongType.h new file mode 100644 index 0000000..d2b9af0 --- /dev/null +++ b/aten/src/TH/THGenerateLongType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateLongType.h" +#endif + +#define real int64_t +#define ureal uint64_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Long +#define THInf LONG_MAX +#define TH_REAL_IS_LONG +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_LONG +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateShortType.h b/aten/src/TH/THGenerateShortType.h new file mode 100644 index 0000000..5b83c47 --- /dev/null +++ b/aten/src/TH/THGenerateShortType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateShortType.h" +#endif + +#define real int16_t +#define ureal uint16_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Short +#define THInf SHRT_MAX +#define TH_REAL_IS_SHORT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_SHORT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp new file mode 100644 index 0000000..f1e6914 --- /dev/null +++ b/aten/src/TH/THGenerator.hpp @@ -0,0 +1,29 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include + +struct THGeneratorState { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[_MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/* A THGenerator contains all the state required for a single random number stream */ +struct THGenerator { + std::mutex mutex; /* mutex for using this generator */ + THGeneratorState gen_state; +}; diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp new file mode 100644 index 0000000..1c46c59 --- /dev/null +++ b/aten/src/TH/THHalf.cpp @@ -0,0 +1,100 @@ +#include "THHalf.h" + +/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
*/ + +THHalf TH_float2half(float f) +{ + THHalf h; + TH_float2halfbits(&f, &h.x); + return h; +} + +TH_API float TH_half2float(THHalf h) +{ + float f; + TH_halfbits2float(&h.x, &f); + return f; +} + +// Host functions for converting between FP32 and FP16 formats + +void TH_halfbits2float(unsigned short* src, float* res) +{ + unsigned h = *src; + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); +} + +void TH_float2halfbits(float* src, unsigned short* dest) +{ + unsigned x = *(unsigned*)src; + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + *dest = 0x7fffU; + return ; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + *dest = sign | 0x7c00U; + return; + } + if (u < 0x33000001) { + *dest = (sign | 0x0000); + return; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + *dest = (sign | (exponent << 10) | mantissa); +} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h new file mode 100644 index 0000000..0f9807b --- /dev/null +++ b/aten/src/TH/THHalf.h @@ -0,0 +1,41 @@ +#ifndef TH_HALF_H +#define TH_HALF_H + +#include "THGeneral.h" +#include + +/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ +#if defined(__GNUC__) +#define __thalign__(n) __attribute__((aligned(n))) +#elif defined(_WIN32) +#define __thalign__(n) __declspec(align(n)) +#else +#define __thalign__(n) +#endif + +typedef struct __thalign__(2){ + unsigned short x; +} __THHalf; + +typedef struct __thalign__(4) { + unsigned int x; +} __THHalf2; + +typedef __THHalf THHalf; +typedef __THHalf2 THHalf2; + +TH_API void TH_float2halfbits(float*, unsigned short*); +TH_API void TH_halfbits2float(unsigned short*, float*); + +TH_API THHalf TH_float2half(float); +TH_API float TH_half2float(THHalf); + +#ifndef TH_HALF_BITS_TO_LITERAL +# define TH_HALF_BITS_TO_LITERAL(n) { n } +#endif + +#define TH_HALF_ZERO 0x0U +#define TH_HALF_INF 0x7C00U + +#undef __thalign__ +#endif diff --git a/aten/src/TH/THLapack.cpp b/aten/src/TH/THLapack.cpp new file mode 100644 index 0000000..e340a63 --- /dev/null +++ b/aten/src/TH/THLapack.cpp @@ -0,0 +1,4 @@ +#include "THLapack.h" + +#include "generic/THLapack.cpp" +#include "THGenerateFloatTypes.h" diff --git a/aten/src/TH/THLapack.h b/aten/src/TH/THLapack.h new file mode 100644 index 0000000..614d15f --- /dev/null +++ b/aten/src/TH/THLapack.h @@ -0,0 +1,27 @@ +#ifndef TH_LAPACK_INC +#define TH_LAPACK_INC 
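+
+/* The macros below wrap the LAPACK `info` convention: info < 0 flags an
+   illegal argument, info > 0 is a routine-specific failure, and both cases are
+   turned into THError calls. An illustrative call site (the routine name and
+   the `info` variable are hypothetical, taken from a wrapper that has already
+   run the LAPACK routine):
+
+     THLapackCheck("Lapack Error in %s : U(%d,%d) is zero, singular U", "gesv", info, info);
+*/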
+ +#include "THGeneral.h" + +#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME) + +#define THLapackCheck(fmt, func, info , ...) \ +if (info < 0) { \ + THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ +} else if(info > 0) { \ + THError(fmt, func, info, ##__VA_ARGS__); \ +} \ + +#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...) \ +if (info < 0) { \ + cleanup \ + THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ +} else if(info > 0) { \ + cleanup \ + THError(fmt, func, info, ##__VA_ARGS__); \ +} + +#include "generic/THLapack.h" +#include "THGenerateAllTypes.h" + +#endif diff --git a/aten/src/TH/THLogAdd.cpp b/aten/src/TH/THLogAdd.cpp new file mode 100644 index 0000000..4b14f85 --- /dev/null +++ b/aten/src/TH/THLogAdd.cpp @@ -0,0 +1,88 @@ +#include "THLogAdd.h" + +#include + +#ifdef USE_DOUBLE +#define MINUS_LOG_THRESHOLD -39.14 +#else +#define MINUS_LOG_THRESHOLD -18.42 +#endif + +const double THLog2Pi=1.83787706640934548355; +const double THLogZero=-DBL_MAX; +const double THLogOne=0; + +double THLogAdd(double log_a, double log_b) +{ + double minusdif; + + if (log_a < log_b) + { + double tmp = log_a; + log_a = log_b; + log_b = tmp; + } + + minusdif = log_b - log_a; +#ifdef DEBUG + if (isnan(minusdif)) + THError("THLogAdd: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); +#endif + if (minusdif < MINUS_LOG_THRESHOLD) + return log_a; + else + return log_a + log1p(exp(minusdif)); +} + +double THLogSub(double log_a, double log_b) +{ + double minusdif; + + if (log_a < log_b) + THError("LogSub: log_a (%f) should be greater than log_b (%f)", log_a, log_b); + + minusdif = log_b - log_a; +#ifdef DEBUG + if (isnan(minusdif)) + THError("LogSub: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); +#endif + if (log_a == log_b) + return THLogZero; + else if (minusdif < MINUS_LOG_THRESHOLD) + return log_a; + else + return log_a + log1p(-exp(minusdif)); +} + +/* Credits to Leon Bottou */ +double THExpMinusApprox(const double x) +{ +#define EXACT_EXPONENTIAL 0 +#if EXACT_EXPONENTIAL + return exp(-x); +#else + /* fast approximation of exp(-x) for x positive */ +# define A0 (1.0) +# define A1 (0.125) +# define A2 (0.0078125) +# define A3 (0.00032552083) +# define A4 (1.0172526e-5) + if (x < 13.0) + { +/* assert(x>=0); */ + double y; + y = A0+x*(A1+x*(A2+x*(A3+x*A4))); + y *= y; + y *= y; + y *= y; + y = 1/y; + return y; + } + return 0; +# undef A0 +# undef A1 +# undef A2 +# undef A3 +# undef A4 +#endif +} diff --git a/aten/src/TH/THLogAdd.h b/aten/src/TH/THLogAdd.h new file mode 100644 index 0000000..9319b8f --- /dev/null +++ b/aten/src/TH/THLogAdd.h @@ -0,0 +1,14 @@ +#ifndef TH_LOG_ADD_INC +#define TH_LOG_ADD_INC + +#include "THGeneral.h" + +TH_API const double THLog2Pi; +TH_API const double THLogZero; +TH_API const double THLogOne; + +TH_API double THLogAdd(double log_a, double log_b); +TH_API double THLogSub(double log_a, double log_b); +TH_API double THExpMinusApprox(const double x); + +#endif diff --git a/aten/src/TH/THMath.h b/aten/src/TH/THMath.h new file mode 100644 index 0000000..638c98a --- /dev/null +++ b/aten/src/TH/THMath.h @@ -0,0 +1,287 @@ +#ifndef _THMATH_H +#define _THMATH_H +#include +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#endif +#include + +#ifndef M_PIf +#define M_PIf 3.1415926535f +#endif // M_PIf + +static inline double TH_sigmoid(double value) { + return 1.0 / (1.0 + exp(-value)); +} + +static inline double TH_frac(double x) { + return x - trunc(x); +} + +static inline double 
TH_rsqrt(double x) { + return 1.0 / sqrt(x); +} + +static inline double TH_lerp(double a, double b, double weight) { + return a + weight * (b-a); +} + +static inline float TH_sigmoidf(float value) { + return 1.0f / (1.0f + expf(-value)); +} + +static inline float TH_fracf(float x) { + return x - truncf(x); +} + +static inline float TH_rsqrtf(float x) { + return 1.0f / sqrtf(x); +} + +static inline float TH_lerpf(float a, float b, float weight) { + return a + weight * (b-a); +} + +/* The next function is taken from https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c. +Below is the copyright. +Output was modified to be inf or -inf when input is 1 or -1. */ + + +/* + Copyright (c) 2014 Indiana University + All rights reserved. + + Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci., + Indiana University, Bloomington, IN + + This software is licensed under the New BSD license: + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided + that the following conditions are met: + + Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + Redistributions in binary form must reproduce the + above copyright notice, this list of conditions and + the following disclaimer in the documentation and/or + other materials provided with the distribution. + + Neither the name of Indiana University nor + the names of its contributors may be used to endorse + or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#define CENTRAL_RANGE 0.7 + +static inline double TH_erfinv(double y) { +/* Function to calculate inverse error function. Rational approximation +is used to generate an initial approximation, which is then improved to +full accuracy by two steps of Newton's method. Code is a direct +translation of the erfinv m file in matlab version 2.0. +Author: Gary L. 
Pavlis, Indiana University +Date: February 1996 +*/ + double x,z,num,dem; /*working variables */ + /* coefficients in rational expansion */ + double a[4]={ 0.886226899, -1.645349621, 0.914624893, -0.140543331}; + double b[4]={-2.118377725, 1.442710462, -0.329097515, 0.012229801}; + double c[4]={-1.970840454, -1.624906493, 3.429567803, 1.641345311}; + double d[2]={ 3.543889200, 1.637067800}; + if(fabs(y) > 1.0) return (atof("NaN")); /* This needs IEEE constant*/ + if(fabs(y) == 1.0) return((copysign(1.0,y))*atof("INFINITY")); + if(fabs(y) <= CENTRAL_RANGE){ + z = y*y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0])*z + 1.0); + x = y*num/dem; + } + else{ + z = sqrt(-log((1.0-fabs(y))/2.0)); + num = ((c[3]*z + c[2])*z + c[1])*z + c[0]; + dem = (d[1]*z + d[0])*z + 1.0; + x = (copysign(1.0,y))*num/dem; + } + /* Two steps of Newton-Raphson correction */ + x = x - (erf(x) - y)/( (2.0/sqrt(M_PI))*exp(-x*x)); + x = x - (erf(x) - y)/( (2.0/sqrt(M_PI))*exp(-x*x)); + + return(x); +} +#undef CENTRAL_RANGE + +static inline double TH_polevl(double x, double *A, size_t len) { + double result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + +static inline float TH_polevlf(float x, float *A, size_t len) { + float result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +static inline double TH_digamma(double x) { + static double PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + + int x_is_integer = x == floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + return TH_digamma(1 - x) - M_PI / tan(M_PI * x); + } + + // Push x to be >= 10 + double result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + double y = 0; + if (x < 1.0e17) { + double z = 1.0 / (x * x); + y = z * TH_polevl(z, A, 6); + } + return result + log(x) - (0.5 / x) - y; +} + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +static inline double TH_digammaf(float x) { + static float PSI_10 = 2.25175258906672110764f; + if (x == 0) { + return INFINITY; + } + + int x_is_integer = x == floorf(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // Avoid rounding errors for `tan`'s input. + // Those make a big difference at extreme values. 
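// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the comment
// above is about argument accuracy. Once |x| is large, pi*x rounded to float
// is off by a sizeable fraction of a radian, so tanf() on that argument is
// far less accurate than evaluating the whole reflection term in double and
// only rounding the final result, which is what the next line does. A
// standalone demo, with an arbitrarily chosen input:
#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = 3.14159265358979323846;
  float x = -100000.25f;                                // large, non-integer
  float float_only = (float)pi / tanf((float)pi * x);   // everything in float
  float via_double = (float)(pi / tan(pi * (double)x)); // as in the code below
  // Since x ends in .25 and tan has period pi, the exact value is -pi.
  printf("float only: %g   via double: %g\n", float_only, via_double);
  return 0;
}
// (end of editor's aside)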
+ float pi_over_tan_pi_x = (float)(M_PI / tan(M_PI * (double)x)); + return TH_digammaf(1 - x) - pi_over_tan_pi_x; + } + + // Push x to be >= 10 + float result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static float A[] = { + 8.33333333333333333333E-2f, + -2.10927960927960927961E-2f, + 7.57575757575757575758E-3f, + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f, + }; + + float y = 0; + if (x < 1.0e17) { + float z = 1 / (x * x); + y = z * TH_polevlf(z, A, 6); + } + return result + logf(x) - (0.5 / x) - y; +} + +static inline double TH_trigamma(double x) { + double sign = +1; + double result = 0; + if (x < 0.5) { + sign = -1; + const double sin_pi_x = sin(M_PI * x); + result -= (M_PI * M_PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const double ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x; + return sign * result; +} + +static inline float TH_trigammaf(float x) { + float sign = +1; + float result = 0; + if (x < 0.5f) { + sign = -1; + const float sin_pi_x = sinf(M_PIf * x); + result -= (M_PIf * M_PIf) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const float ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + return sign * result; +} + +#endif // _THMATH_H diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp new file mode 100644 index 0000000..e13b02f --- /dev/null +++ b/aten/src/TH/THMemoryFile.cpp @@ -0,0 +1,689 @@ +#include "THMemoryFile.h" +#include "THStorage.hpp" +#include "THFilePrivate.h" +#include "THDiskFile.h" +#include "stdint.h" + +#ifndef _WIN32 +#include +#endif + +typedef struct THMemoryFile__ +{ + THFile file; + THCharStorage *storage; + ssize_t size; + ssize_t position; + int longSize; + +} THMemoryFile; + +static int THMemoryFile_isOpened(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + return (mfself->storage != NULL); +} + +static int8_t *THMemoryFile_strnextspace(int8_t *str_, int8_t *c_) +{ + int8_t c; + + while( (c = *str_) ) + { + if( (c != ' ') && (c != '\n') && (c != ':') && (c != ';') ) + break; + str_++; + } + + while( (c = *str_) ) + { + if( (c == ' ') || (c == '\n') || (c == ':') || (c == ';') ) + { + *c_ = c; + *str_ = '\0'; + return(str_); + } + str_++; + } + return NULL; +} + +static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) +{ + ssize_t missingSpace; + + if(size <= self->size) + return; + else + { + if(size < self->storage->size) /* note the "<" and not "<=" */ + { + self->size = size; + THCharStorage_data(self->storage)[self->size] = '\0'; + return; + } + } + + missingSpace = size-self->storage->size+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->size/2 > missingSpace ? 
+ self->storage->size + (self->storage->size/2) + : self->storage->size + missingSpace)); +} + +static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + else if(strlen(mode) == 2) + { + if(mode[0] == 'r' && mode[1] == 'w') + { + *isReadable = 1; + *isWritable = 1; + return 1; + } + } + return 0; +} + +/********************************************************/ + +#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM, INSIDE_SPACING) \ + static ssize_t THMemoryFile_read##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THMemoryFile *mfself = (THMemoryFile*)self; \ + ssize_t nread = 0; \ + \ + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); \ + \ + if (n == 0) \ + return 0; \ + \ + if(mfself->file.isBinary) \ + { \ + ssize_t nByte = sizeof(TYPE)*n; \ + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); \ + nread = nByteRemaining/sizeof(TYPE); \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nread*sizeof(TYPE)); \ + mfself->position += nread*sizeof(TYPE); \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ssize_t nByteRead = 0; \ + int8_t spaceChar = 0; \ + int8_t *spacePtr = THMemoryFile_strnextspace(THCharStorage_data(mfself->storage)+mfself->position, &spaceChar); \ + ASCII_READ_ELEM; \ + if(ret == EOF) \ + { \ + while(THCharStorage_data(mfself->storage)[mfself->position]) \ + mfself->position++; \ + } \ + else \ + mfself->position += nByteRead; \ + if(spacePtr) \ + *spacePtr = spaceChar; \ + } \ + if(mfself->file.isAutoSpacing && (n > 0)) \ + { \ + if( (mfself->position < mfself->size) && (THCharStorage_data(mfself->storage)[mfself->position] == '\n') ) \ + mfself->position++; \ + } \ + } \ + \ + if(nread != n) \ + { \ + mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ \ + if(!mfself->file.isQuiet) \ + THError("read error: read %d blocks instead of %d", nread, n); \ + } \ + \ + return nread; \ + } \ + \ + static ssize_t THMemoryFile_write##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THMemoryFile *mfself = (THMemoryFile*)self; \ + \ + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); \ + \ + if (n == 0) \ + return 0; \ + \ + if(mfself->file.isBinary) \ + { \ + ssize_t nByte = sizeof(TYPE)*n; \ + THMemoryFile_grow(mfself, mfself->position+nByte); \ + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByte); \ + mfself->position += nByte; \ + if(mfself->position > mfself->size) \ + { \ + mfself->size = mfself->position; \ + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; \ + } \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ssize_t nByteWritten; \ + while (1) \ + { \ + ASCII_WRITE_ELEM; \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) \ + { \ + mfself->position += nByteWritten; \ + break; \ + } \ + THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); \ + } \ + if(mfself->file.isAutoSpacing) \ + { \ + if(i < n-1) \ + { \ + THMemoryFile_grow(mfself, mfself->position+1); \ + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, " "); \ + mfself->position++; \ + } \ + if(i == n-1) \ + { \ + THMemoryFile_grow(mfself, mfself->position+1); \ + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, "\n"); \ + mfself->position++; \ + } \ + } \ + } \ + if(mfself->position > mfself->size) \ + { \ + mfself->size = mfself->position; \ + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; \ + } \ + } \ + \ + return n; \ + } + + +void THMemoryFile_longSize(THFile *self, int size) +{ + THMemoryFile *dfself = (THMemoryFile*)(self); + THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); + dfself->longSize = size; +} + +THCharStorage *THMemoryFile_storage(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + + THCharStorage_resize(mfself->storage, mfself->size+1); + + return mfself->storage; +} + +static void THMemoryFile_synchronize(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); +} + +static void THMemoryFile_seek(THFile *self, ssize_t position) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(position >= 0, 2, "position must be positive"); + + if(position <= mfself->size) + mfself->position = position; + else + { + mfself->file.hasError = 1; + if(!mfself->file.isQuiet) + THError("unable to seek at position %zu", position); + } +} + +static void THMemoryFile_seekEnd(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + + mfself->position = mfself->size; +} + +static ssize_t THMemoryFile_position(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + return mfself->position; +} + +static void THMemoryFile_close(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + 
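// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the ASCII
// write path in the READ_WRITE_METHODS macro above formats each element with
// snprintf into the CharStorage and, when snprintf reports that the output
// did not fit, grows the buffer by roughly half its size plus two and tries
// again. The same pattern with plain malloc/realloc, using hypothetical
// names (append_double is not a TH function):
#include <stdio.h>
#include <stdlib.h>

// Appends the formatted value at `pos`, growing the buffer until it fits.
static size_t append_double(char **buf, size_t *cap, size_t pos, double value) {
  for (;;) {
    size_t avail = *cap - pos;
    int written = snprintf(*buf + pos, avail, "%.17g", value);
    if (written >= 0 && (size_t)written < avail)
      return pos + (size_t)written;     // it fit: advance the write position
    *cap += *cap / 2 + 2;               // same growth policy as the macro
    *buf = (char *)realloc(*buf, *cap); // (a real program would check NULL)
  }
}

int main(void) {
  size_t cap = 4, pos = 0;
  char *buf = (char *)malloc(cap);
  pos = append_double(&buf, &cap, pos, 3.141592653589793);
  printf("\"%s\" written, capacity grew to %zu\n", buf, cap);
  free(buf);
  return 0;
}
// (end of editor's aside)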
THCharStorage_free(mfself->storage); + mfself->storage = NULL; +} + +static void THMemoryFile_free(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + if(mfself->storage) + THCharStorage_free(mfself->storage); + + THFree(mfself); +} + +/* READ_WRITE_METHODS(bool, Bool, */ +/* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%d", value), */ +/* 1) */ + +READ_WRITE_METHODS(uint8_t, Byte, + ssize_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ + if(spacePtr) *spacePtr = spaceChar; \ + nByteRead = ret; \ + nread = ret; \ + i = n-1; \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), + nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \ + i = n-1; \ + if(nByteWritten > -1) + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), + 0) + +/* DEBUG: we should check if %n is count or not as a element (so ret might need to be ret-- on some systems) */ +/* Note that we do a trick for char */ +READ_WRITE_METHODS(int8_t, Char, + ssize_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ + if(spacePtr) *spacePtr = spaceChar; \ + nByteRead = ret; \ + nread = ret; \ + i = n-1; \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), + nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \ + i = n-1; \ + if(nByteWritten > -1) + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), + 0) + +READ_WRITE_METHODS(int16_t, Short, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%hd", data[i]), + 1) + +READ_WRITE_METHODS(int32_t, Int, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]), + 1) + +READ_WRITE_METHODS(float, Float, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]), + 1) + +READ_WRITE_METHODS(THHalf, Half, + int nByteRead_; float buf; \ + int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ + data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])), + 1) + +READ_WRITE_METHODS(double, Double, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; 
else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]), + 1) + +static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + ssize_t nread = 0L; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); + + if (n == 0) + return 0; + + if(mfself->file.isBinary) + { + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) + { + ssize_t nByte = sizeof(int64_t)*n; + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + nread = nByteRemaining/sizeof(int64_t); + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nread*sizeof(int64_t)); + mfself->position += nread*sizeof(int64_t); + } else if(mfself->longSize == 4) + { + ssize_t nByte = 4*n; + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + nread = nByteRemaining/4; + ssize_t i; + for(i = 0; i < nread; i++) + data[i] = storage[i]; + mfself->position += nread*4; + } + else /* if(mfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + ssize_t nByte = 8*n; + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + nread = nByteRemaining/8; + ssize_t i; + for(i = 0; i < nread; i++) + data[i] = storage[2*i + big_endian]; + mfself->position += nread*8; + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + ssize_t nByteRead = 0; + int8_t spaceChar = 0; + int8_t *spacePtr = THMemoryFile_strnextspace(THCharStorage_data(mfself->storage)+mfself->position, &spaceChar); + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%" PRId64 "%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++; + if(ret == EOF) + { + while(THCharStorage_data(mfself->storage)[mfself->position]) + mfself->position++; + } + else + mfself->position += nByteRead; + if(spacePtr) + *spacePtr = spaceChar; + } + if(mfself->file.isAutoSpacing && (n > 0)) + { + if( (mfself->position < mfself->size) && (THCharStorage_data(mfself->storage)[mfself->position] == '\n') ) + mfself->position++; + } + } + + if(nread != n) + { + mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ + if(!mfself->file.isQuiet) + THError("read error: read %d blocks instead of %d", nread, n); + } + + return nread; +} + +static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); + + if (n == 0) + return 0; + + if(mfself->file.isBinary) + { + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) + { + ssize_t nByte = sizeof(int64_t)*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByte); + mfself->position += nByte; + } else if(mfself->longSize == 4) + { + ssize_t nByte = 4*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t i; + for(i = 0; i < n; i++) + storage[i] = (int32_t) data[i]; + mfself->position += nByte; + } + else /* if(mfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + ssize_t nByte = 8*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t i; + for(i = 0; i < n; i++) + { + storage[2*i + !big_endian] = 0; + storage[2*i + big_endian] = (int32_t) data[i]; + } + mfself->position += nByte; + } + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + ssize_t nByteWritten; + while (1) + { + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) + { + mfself->position += nByteWritten; + break; + } + THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); + } + if(mfself->file.isAutoSpacing) + { + if(i < n-1) + { + THMemoryFile_grow(mfself, mfself->position+1); + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, " "); + mfself->position++; + } + if(i == n-1) + { + THMemoryFile_grow(mfself, mfself->position+1); + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, "\n"); + mfself->position++; + } + } + } + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + } + + return n; +} + +static int8_t* THMemoryFile_cloneString(const int8_t *str, ssize_t size) +{ + int8_t *cstr = static_cast(THAlloc(size)); + memcpy(cstr, str, size); + return cstr; +} + +static ssize_t THMemoryFile_readString(THFile *self, const char *format, char **str_) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); + THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); + + if(mfself->position == mfself->size) /* eof ? 
*/ + { + mfself->file.hasError = 1; + if(!mfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + + if(format[1] == 'a') + { + ssize_t str_size = mfself->size-mfself->position; + + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, str_size); + mfself->position = mfself->size; + + return str_size; + } + else + { + int8_t *p = THCharStorage_data(mfself->storage)+mfself->position; + int eolFound = 0; + ssize_t posEol; + ssize_t i; + for(i = 0; i < mfself->size-mfself->position; i++) + { + if(p[i] == '\n') + { + posEol = i; + eolFound = 1; + break; + } + } + + if(eolFound) + { + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, posEol); + mfself->position += posEol+1; + return posEol; + } + else /* well, we read all! */ + { + ssize_t str_size = mfself->size-mfself->position; + + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, str_size); + mfself->position = mfself->size; + + return str_size; + } + } + + *str_ = NULL; + return 0; +} + +static ssize_t THMemoryFile_writeString(THFile *self, const char *str, ssize_t size) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); + + THMemoryFile_grow(mfself, mfself->position+size); + memmove(THCharStorage_data(mfself->storage)+mfself->position, str, size); + mfself->position += size; + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + + return size; +} + +THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) +{ + static struct THFileVTable vtable = { + THMemoryFile_isOpened, + + THMemoryFile_readByte, + THMemoryFile_readChar, + THMemoryFile_readShort, + THMemoryFile_readInt, + THMemoryFile_readLong, + THMemoryFile_readFloat, + THMemoryFile_readDouble, + THMemoryFile_readHalf, + THMemoryFile_readString, + + THMemoryFile_writeByte, + THMemoryFile_writeChar, + THMemoryFile_writeShort, + THMemoryFile_writeInt, + THMemoryFile_writeLong, + THMemoryFile_writeFloat, + THMemoryFile_writeDouble, + THMemoryFile_writeHalf, + THMemoryFile_writeString, + + THMemoryFile_synchronize, + THMemoryFile_seek, + THMemoryFile_seekEnd, + THMemoryFile_position, + THMemoryFile_close, + THMemoryFile_free + }; + + THMemoryFile *mfself; + int isReadable; + int isWritable; + + if(storage) + { + THArgCheck(THCharStorage_data(storage)[storage->size-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + THCharStorage_retain(storage); + } + else + { + THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + storage = THCharStorage_newWithSize(1); + THCharStorage_data(storage)[0] = '\0'; + } + + mfself = static_cast(THAlloc(sizeof(THMemoryFile))); + + mfself->storage = storage; + mfself->size = (storage ? 
storage->size-1 : 0); + mfself->position = 0; + mfself->longSize = 0; + + mfself->file.vtable = &vtable; + mfself->file.isQuiet = 0; + mfself->file.isReadable = isReadable; + mfself->file.isWritable = isWritable; + mfself->file.isBinary = 0; + mfself->file.isAutoSpacing = 1; + mfself->file.hasError = 0; + + return (THFile*)mfself; +} + +THFile *THMemoryFile_new(const char *mode) +{ + return THMemoryFile_newWithStorage(NULL, mode); +} diff --git a/aten/src/TH/THMemoryFile.h b/aten/src/TH/THMemoryFile.h new file mode 100644 index 0000000..b54cdcc --- /dev/null +++ b/aten/src/TH/THMemoryFile.h @@ -0,0 +1,13 @@ +#ifndef TH_MEMORY_FILE_INC +#define TH_MEMORY_FILE_INC + +#include "THFile.h" +#include "THStorage.h" + +TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode); +TH_API THFile *THMemoryFile_new(const char *mode); + +TH_API THCharStorage *THMemoryFile_storage(THFile *self); +TH_API void THMemoryFile_longSize(THFile *self, int size); + +#endif diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp new file mode 100644 index 0000000..8755f77 --- /dev/null +++ b/aten/src/TH/THRandom.cpp @@ -0,0 +1,316 @@ +#include "THGeneral.h" +#include "THRandom.h" +#include "THGenerator.hpp" + +#ifndef _WIN32 +#include +#include +#endif + +/* Code for the Mersenne Twister random generator.... */ +#define n _MERSENNE_STATE_N +#define m _MERSENNE_STATE_M + +/* Creates (unseeded) new generator*/ +static THGenerator* THGenerator_newUnseeded() +{ + THGenerator *self = (THGenerator *)THAlloc(sizeof(THGenerator)); + memset(self, 0, sizeof(THGenerator)); + self->gen_state.left = 1; + self->gen_state.seeded = 0; + self->gen_state.normal_is_valid = 0; + new (&self->mutex) std::mutex(); + return self; +} + +/* Creates new generator and makes sure it is seeded*/ +THGenerator* THGenerator_new() +{ + THGenerator *self = THGenerator_newUnseeded(); + THRandom_seed(self); + return self; +} + +THGenerator* THGenerator_copy(THGenerator *self, THGenerator *from) +{ + THGeneratorState_copy(&self->gen_state, &from->gen_state); + return self; +} + +void THGenerator_free(THGenerator *self) +{ + self->mutex.~mutex(); + THFree(self); +} + +int THGeneratorState_isValid(THGeneratorState *_gen_state) +{ + if ((_gen_state->seeded == 1) && + (_gen_state->left > 0 && _gen_state->left <= n) && (_gen_state->next <= n)) + return 1; + + return 0; +} + +THGeneratorState* THGeneratorState_copy(THGeneratorState *self, THGeneratorState *from) +{ + memcpy(self, from, sizeof(THGeneratorState)); + return self; +} + +#ifndef _WIN32 +static uint64_t readURandomLong() +{ + int randDev = open("/dev/urandom", O_RDONLY); + uint64_t randValue; + if (randDev < 0) { + THError("Unable to open /dev/urandom"); + } + ssize_t readBytes = read(randDev, &randValue, sizeof(randValue)); + if (readBytes < (ssize_t) sizeof(randValue)) { + THError("Unable to read from /dev/urandom"); + } + close(randDev); + return randValue; +} +#endif // _WIN32 + +uint64_t THRandom_seed(THGenerator *_generator) +{ +#ifdef _WIN32 + uint64_t s = (uint64_t)time(0); +#else + uint64_t s = readURandomLong(); +#endif + THRandom_manualSeed(_generator, s); + return s; +} + +/* The next 4 methods are taken from http:www.math.keio.ac.jpmatumotoemt.html + Here is the copyright: + Some minor modifications have been made to adapt to "my" C... */ + +/* + A C-program for MT19937, with initialization improved 2002/2/10. + Coded by Takuji Nishimura and Makoto Matsumoto. 
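// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): minimal usage
// of the generator API introduced by this file and THRandom.h. A fresh
// generator is seeded from /dev/urandom (or time() on Windows) as above;
// reseeding it manually makes the draws reproducible. Assumes TH is built and
// its headers are on the include path.
#include <stdio.h>
#include "THRandom.h"

int main(void) {
  THGenerator *gen = THGenerator_new();   // already seeded from /dev/urandom
  THRandom_manualSeed(gen, 42);           // reseed: draws are now reproducible
  for (int i = 0; i < 3; i++)
    printf("%f\n", THRandom_uniform(gen, 0.0, 1.0));
  printf("initial seed: %llu\n", (unsigned long long)THRandom_initialSeed(gen));
  THGenerator_free(gen);
  return 0;
}
// (end of editor's aside)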
+ This is a faster version by taking Shawn Cokus's optimization, + Matthe Bellew's simplification, Isaku Wada's double version. + + Before using, initialize the state by using init_genrand(seed) + or init_by_array(init_key, key_length). + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Any feedback is very welcome. + http://www.math.keio.ac.jp/matumoto/emt.html + email: matumoto@math.keio.ac.jp +*/ + +/* Macros for the Mersenne Twister random generator... */ +/* Period parameters */ +/* #define n 624 */ +/* #define m 397 */ +#define MATRIX_A 0x9908b0dfUL /* constant vector a */ +#define UMASK 0x80000000UL /* most significant w-r bits */ +#define LMASK 0x7fffffffUL /* least significant r bits */ +#define MIXBITS(u,v) ( ((u) & UMASK) | ((v) & LMASK) ) +#define TWIST(u,v) ((MIXBITS(u,v) >> 1) ^ ((v)&1UL ? MATRIX_A : 0UL)) +/*********************************************************** That's it. */ + +void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_) +{ + int j; + + /* This ensures reseeding resets all of the state (i.e. state for Gaussian numbers) */ + THGenerator *blank = THGenerator_newUnseeded(); + THGenerator_copy(_generator, blank); + THGenerator_free(blank); + + _generator->gen_state.the_initial_seed = the_seed_; + _generator->gen_state.state[0] = _generator->gen_state.the_initial_seed & 0xffffffffUL; + for(j = 1; j < n; j++) + { + _generator->gen_state.state[j] = (1812433253UL * (_generator->gen_state.state[j-1] ^ (_generator->gen_state.state[j-1] >> 30)) + j); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, mSBs of the seed affect */ + /* only mSBs of the array state[]. 
*/ + /* 2002/01/09 modified by makoto matsumoto */ + _generator->gen_state.state[j] &= 0xffffffffUL; /* for >32 bit machines */ + } + _generator->gen_state.left = 1; + _generator->gen_state.seeded = 1; +} + +uint64_t THRandom_initialSeed(THGenerator *_generator) +{ + return _generator->gen_state.the_initial_seed; +} + +void THRandom_nextState(THGenerator *_generator) +{ + uint64_t *p = _generator->gen_state.state; + int j; + + _generator->gen_state.left = n; + _generator->gen_state.next = 0; + + for(j = n-m+1; --j; p++) + *p = p[m] ^ TWIST(p[0], p[1]); + + for(j = m; --j; p++) + *p = p[m-n] ^ TWIST(p[0], p[1]); + + *p = p[m-n] ^ TWIST(p[0], _generator->gen_state.state[0]); +} + +// TODO: this only returns 32-bits of randomness but as a uint64_t. This is +// weird and should be fixed. We should also fix the state to be uint32_t +// instead of uint64_t. (Or switch to a 64-bit random number generator). +uint64_t THRandom_random(THGenerator *_generator) +{ + uint64_t y; + + if (--(_generator->gen_state.left) == 0) + THRandom_nextState(_generator); + y = *(_generator->gen_state.state + (_generator->gen_state.next)++); + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 15) & 0xefc60000UL; + y ^= (y >> 18); + + return y; +} + +uint64_t THRandom_random64(THGenerator *_generator) +{ + uint64_t hi = THRandom_random(_generator); + uint64_t lo = THRandom_random(_generator); + return (hi << 32) | lo; +} + +// doubles have 52 bits of mantissa (fractional part) +static uint64_t DOUBLE_MASK = (1ULL << 53) - 1; +static double DOUBLE_DIVISOR = 1.0 / (1ULL << 53); + +// floats have 23 bits of mantissa (fractional part) +static uint32_t FLOAT_MASK = (1 << 24) - 1; +static float FLOAT_DIVISOR = 1.0f / (1 << 24); + +/* generates a random number on [0,1)-double-interval */ +static double uniform_double(THGenerator *_generator) +{ + uint64_t x = THRandom_random64(_generator); + return (x & DOUBLE_MASK) * DOUBLE_DIVISOR; +} + +/* generates a random number on [0,1)-double-interval */ +static float uniform_float(THGenerator *_generator) +{ + uint32_t x = (uint32_t)THRandom_random(_generator); + return (x & FLOAT_MASK) * FLOAT_DIVISOR; +} + +/********************************************************* + + Thanks *a lot* Takuji Nishimura and Makoto Matsumoto! + + Now my own code... + +*********************************************************/ + +double THRandom_standard_uniform(THGenerator *_generator) +{ + return uniform_double(_generator); +} + +double THRandom_uniform(THGenerator *_generator, double a, double b) +{ + return(uniform_double(_generator) * (b - a) + a); +} + +float THRandom_uniformFloat(THGenerator *_generator, float a, float b) +{ + return(uniform_float(_generator) * (b - a) + a); +} + +double THRandom_normal(THGenerator *_generator, double mean, double stdv) +{ + THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); + + /* This is known as the Box-Muller method */ + if(!_generator->gen_state.normal_is_valid) + { + _generator->gen_state.normal_x = uniform_double(_generator); + _generator->gen_state.normal_y = uniform_double(_generator); + _generator->gen_state.normal_rho = sqrt(-2. 
* log(1.0-_generator->gen_state.normal_y)); + _generator->gen_state.normal_is_valid = 1; + } + else + _generator->gen_state.normal_is_valid = 0; + + if(_generator->gen_state.normal_is_valid) + return _generator->gen_state.normal_rho*cos(2.*M_PI*_generator->gen_state.normal_x)*stdv+mean; + else + return _generator->gen_state.normal_rho*sin(2.*M_PI*_generator->gen_state.normal_x)*stdv+mean; +} + +double THRandom_exponential(THGenerator *_generator, double lambda) +{ + return(-1. / lambda * log(1-uniform_double(_generator))); +} + +double THRandom_cauchy(THGenerator *_generator, double median, double sigma) +{ + return(median + sigma * tan(M_PI*(uniform_double(_generator)-0.5))); +} + +/* Faut etre malade pour utiliser ca. + M'enfin. */ +double THRandom_logNormal(THGenerator *_generator, double mean, double stdv) +{ + THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); + return(exp(THRandom_normal(_generator, mean, stdv))); +} + +int THRandom_geometric(THGenerator *_generator, double p) +{ + THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1"); + return((int)(log(1-uniform_double(_generator)) / log(p)) + 1); +} + +int THRandom_bernoulli(THGenerator *_generator, double p) +{ + THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1"); + return(uniform_double(_generator) <= p); +} diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h new file mode 100644 index 0000000..5460d33 --- /dev/null +++ b/aten/src/TH/THRandom.h @@ -0,0 +1,83 @@ +#ifndef TH_RANDOM_INC +#define TH_RANDOM_INC + +#include "THGeneral.h" + +#define _MERSENNE_STATE_N 624 +#define _MERSENNE_STATE_M 397 + +/* Struct definition is moved to THGenerator.hpp, because THRandom.h +needs to be C-compatible in order to be included in C FFI extensions. */ +typedef struct THGenerator THGenerator; +typedef struct THGeneratorState THGeneratorState; + +#define torch_Generator "torch.Generator" + +/* Manipulate THGenerator objects */ +TH_API THGenerator * THGenerator_new(void); +TH_API THGenerator * THGenerator_copy(THGenerator *self, THGenerator *from); +TH_API void THGenerator_free(THGenerator *gen); + +/* Checks if given generator state is valid */ +TH_API int THGeneratorState_isValid(THGeneratorState *_gen_state); + +/* Manipulate THGeneratorState objects */ +TH_API THGeneratorState * THGeneratorState_copy(THGeneratorState *self, THGeneratorState *from); + +/* Initializes the random number generator from /dev/urandom (or on Windows +platforms with the current time (granularity: seconds)) and returns the seed. */ +TH_API uint64_t THRandom_seed(THGenerator *_generator); + +/* Initializes the random number generator with the given int64_t "the_seed_". */ +TH_API void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_); + +/* Returns the starting seed used. */ +TH_API uint64_t THRandom_initialSeed(THGenerator *_generator); + +/* Generates a uniform 32 bits integer. */ +TH_API uint64_t THRandom_random(THGenerator *_generator); + +/* Generates a uniform 64 bits integer. */ +TH_API uint64_t THRandom_random64(THGenerator *_generator); + +/* Generates a uniform random double on [0,1). */ +TH_API double THRandom_standard_uniform(THGenerator *_generator); + +/* Generates a uniform random double on [a, b). */ +TH_API double THRandom_uniform(THGenerator *_generator, double a, double b); + +/* Generates a uniform random float on [0,1). */ +TH_API float THRandom_uniformFloat(THGenerator *_generator, float a, float b); + +/** Generates a random number from a normal distribution. 
+ (With mean #mean# and standard deviation #stdv >= 0#). +*/ +TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv); + +/** Generates a random number from an exponential distribution. + The density is $p(x) = lambda * exp(-lambda * x)$, where + lambda is a positive number. +*/ +TH_API double THRandom_exponential(THGenerator *_generator, double lambda); + +/** Returns a random number from a Cauchy distribution. + The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$ +*/ +TH_API double THRandom_cauchy(THGenerator *_generator, double median, double sigma); + +/** Generates a random number from a log-normal distribution. + (#mean > 0# is the mean of the log-normal distribution + and #stdv# is its standard deviation). +*/ +TH_API double THRandom_logNormal(THGenerator *_generator, double mean, double stdv); + +/** Generates a random number from a geometric distribution. + It returns an integer #i#, where $p(i) = (1-p) * p^(i-1)$. + p must satisfy $0 < p < 1$. +*/ +TH_API int THRandom_geometric(THGenerator *_generator, double p); + +/* Returns true with probability $p$ and false with probability $1-p$ (p > 0). */ +TH_API int THRandom_bernoulli(THGenerator *_generator, double p); + +#endif diff --git a/aten/src/TH/THSize.cpp b/aten/src/TH/THSize.cpp new file mode 100644 index 0000000..2eb0039 --- /dev/null +++ b/aten/src/TH/THSize.cpp @@ -0,0 +1,26 @@ +#include "THSize.h" + +int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB) { + int d; + if (dimsA != dimsB) + return 0; + for(d = 0; d < dimsA; ++d) + { + if(sizeA[d] != sizeB[d]) + return 0; + } + return 1; +} + +ptrdiff_t THSize_nElement(int64_t dims, int64_t *size) { + if(dims == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < dims; d++) + nElement *= size[d]; + return nElement; + } +} diff --git a/aten/src/TH/THSize.h b/aten/src/TH/THSize.h new file mode 100644 index 0000000..2927f21 --- /dev/null +++ b/aten/src/TH/THSize.h @@ -0,0 +1,13 @@ +#ifndef TH_SIZE_INC +#define TH_SIZE_INC + +#include "THGeneral.h" +#include + +// THTensor functions that would work on a THSize if we had such a class in C++, +// i.e. THTensor functions that depend only on the shape of the tensor, not the type. 
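// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the two
// shape-only helpers declared below distinguish "same size" (same number of
// dimensions and the same extent in each) from merely having the same number
// of elements. Assumes the TH headers are on the include path.
#include <stdint.h>
#include <stdio.h>
#include "THSize.h"

int main(void) {
  int64_t a[3] = {2, 3, 4};
  int64_t b[2] = {6, 4};
  printf("same size: %d\n", THSize_isSameSizeAs(a, 3, b, 2)); // prints 0
  printf("elements: %td vs %td\n",
         THSize_nElement(3, a), THSize_nElement(2, b));       // 24 vs 24
  return 0;
}
// (end of editor's aside)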
+ +TH_API int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB); +TH_API ptrdiff_t THSize_nElement(int64_t dims, int64_t *size); + +#endif diff --git a/aten/src/TH/THStorage.cpp b/aten/src/TH/THStorage.cpp new file mode 100644 index 0000000..f4910c3 --- /dev/null +++ b/aten/src/TH/THStorage.cpp @@ -0,0 +1,228 @@ +#include + +#include "THStorage.hpp" + +#include "generic/THStorage.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.cpp" +#include "THGenerateHalfType.h" + +// Free a non-weak pointer to THStorage +void THStorage_free(THStorage *storage) { + if (!storage) { + return; + } + + if (storage->flag & TH_STORAGE_REFCOUNTED) { + if (--storage->refcount == 0) { + if (storage->finalizer) { + (*storage->finalizer)(); + } + storage->finalizer.~unique_ptr(); + storage->data_ptr.~DataPtr(); + THStorage_weakFree(storage); + } + } +} + +// Manually retains a weak reference +void THStorage_weakRetain(THStorage *weak_storage) { + weak_storage->weakcount++; +} + +// Releases a weak reference +void THStorage_weakFree(THStorage *weak_storage) { + if (--weak_storage->weakcount == 0) { + weak_storage->refcount.~atomic(); + weak_storage->weakcount.~atomic(); + THFree(weak_storage); + } +} + +// Given a weak reference, returns a strong reference to a storage (which must +// be freed when done) or null if the storage is already dead. +THStorage* THStorage_weakLock(THStorage *weak_storage) { + for (;;) { + int refcount = weak_storage->refcount.load(); + if (refcount == 0) return nullptr; + if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; + } + return weak_storage; +} + +THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { + return _THSizeDesc(THLongStorage_data(size), size->size); +} + +THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement) +{ + ptrdiff_t total_size = (size->size > 0 ? 
1 : 0); + ptrdiff_t dim_infer = -1; + ptrdiff_t i; + for (i = 0; i < size->size; i++) { + if (THLongStorage_data(size)[i] == -1) { + THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred"); + dim_infer = i; + } else { + total_size *= THLongStorage_data(size)[i]; + } + } + if (dim_infer != -1) { + THDescBuff buf = THLongStorage_sizeDesc(size); + THArgCheck(total_size > 0 && nElement % total_size == 0, 2, + "size '%s' is invalid for input with %td elements", buf.str, nElement); + } else { + THDescBuff buf = THLongStorage_sizeDesc(size); + THArgCheck(nElement == total_size, 2, + "size '%s' is invalid for input with %td elements", buf.str, nElement); + } + THLongStorage* copy = THLongStorage_newWithSize(size->size); + THLongStorage_copy(copy, size); + if (dim_infer != -1) { + THLongStorage_data(copy)[dim_infer] = nElement / total_size; + } + return copy; +} + +THStorage* THStorage_new(at::ScalarType scalar_type) +{ + return THStorage_newWithSize(scalar_type, 0); +} + +THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size) +{ + return THStorage_newWithAllocator(scalar_type, size, getTHDefaultAllocator()); +} + +THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, + at::Allocator *allocator) +{ + THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(allocator->allocate(at::elementSize(scalar_type)*size)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); // from the strong reference + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} + +ptrdiff_t THStorage_size(const THStorage *self) +{ + return self->size; +} + +size_t THStorage_elementSize(const THStorage *self) +{ + return at::elementSize(self->scalar_type); +} + +THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags) +{ + size_t actual_size = -1; + THStorage *storage = THStorage_newWithDataAndAllocator(scalar_type, + THMapAllocator::makeDataPtr( + filename, + flags, + size * at::elementSize(scalar_type), + &actual_size), + size, + /* allocator */ nullptr); + + if (size <= 0) { + storage->size = actual_size/THStorage_elementSize(storage); + } + + THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); + + return storage; +} + +void THStorage_setFlag(THStorage *storage, const char flag) +{ + storage->flag |= flag; +} + +void THStorage_clearFlag(THStorage *storage, const char flag) +{ + storage->flag &= ~flag; +} + +void THStorage_retain(THStorage *storage) +{ + if (storage && (storage->flag & TH_STORAGE_REFCOUNTED)) { + ++storage->refcount; + } +} + +/* +// I don't think you should ever call this +THStorage* THStorage_newWithData(at::ScalarType scalar_type, std::unique_ptr data, ptrdiff_t size) +{ + return THStorage_newWithDataAndAllocator(scalar_type, data, size, + getTHDefaultAllocator()); +} +*/ + +THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + THAllocator* allocator) { + THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(std::move(data)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); // from the strong reference + new (&storage->finalizer) 
std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} + +void THStorage_resize(THStorage *storage, ptrdiff_t size) +{ + if (storage->flag & TH_STORAGE_RESIZABLE) + { + /* case when the allocator does not have a realloc defined */ + at::DataPtr old_data; + std::swap(old_data, storage->data_ptr); + ptrdiff_t old_size = storage->size; + if (size != 0) { + storage->data_ptr = storage->allocator->allocate(at::elementSize(storage->scalar_type)*size); + } + storage->size = size; + if (old_data != nullptr) { + ptrdiff_t copy_size = old_size; + if (storage->size < copy_size) { + copy_size = storage->size; + } + if (copy_size > 0) { + memcpy(storage->data_ptr.get(), old_data.get(), at::elementSize(storage->scalar_type)*copy_size); + } + } + } else { + THError("Trying to resize storage that is not resizable"); + } +} + +void THStorage_swap(THStorage *storage1, THStorage *storage2) +{ +#define SWAP(val) { std::swap(storage1->val, storage2->val); } + SWAP(scalar_type); + SWAP(data_ptr); + SWAP(size); + // don't swap refcount! + SWAP(flag); + SWAP(allocator); + SWAP(finalizer); +#undef SWAP +} diff --git a/aten/src/TH/THStorage.h b/aten/src/TH/THStorage.h new file mode 100644 index 0000000..ce53827 --- /dev/null +++ b/aten/src/TH/THStorage.h @@ -0,0 +1,25 @@ +#pragma once + +#include "THGeneral.h" +#include "THAllocator.h" + +#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) + +#include "generic/THStorage.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.h" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateHalfType.h" + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +TH_API void THStorage_free(THStorage *storage); +TH_API void THStorage_weakFree(THStorage *storage); + +TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); +TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); diff --git a/aten/src/TH/THStorage.hpp b/aten/src/TH/THStorage.hpp new file mode 100644 index 0000000..d767ada --- /dev/null +++ b/aten/src/TH/THStorage.hpp @@ -0,0 +1,86 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THStorage.h" + +#include +#include +#include "THTypeConversion.hpp" +#include + +// Note [Weak references for intrusive refcounting] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Here's the scheme: +// +// - refcount == number of strong references to the object +// weakcount == number of weak references to the object, +// plus one more if refcount > 0 +// +// - THStorage stays live as long as there are any strong +// or weak pointers to it (weakcount > 0, since strong +// references count as a +1 to weakcount) +// +// - finalizers are called and data_ptr is deallocated when refcount == 0 +// +// - Once refcount == 0, it can never again be > 0 (the transition +// from > 0 to == 0 is monotonic) +// +// - When you access THStorage via a weak pointer, you must +// atomically increment the use count, if it is greater than 0. +// If it is not, you must report that the storage is dead. 
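// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): a toy version
// of the counting scheme described in this note, with hypothetical names.
// The interesting operation is promoting a weak reference to a strong one,
// which may only succeed while refcount > 0; the CAS loop below mirrors
// THStorage_weakLock in THStorage.cpp above.
#include <atomic>
#include <cstdio>

struct Toy {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references, plus 1 while refcount > 0
};

// Returns the object with its refcount bumped, or nullptr if already dead.
Toy* toy_weak_lock(Toy* t) {
  int rc = t->refcount.load();
  while (rc > 0) {
    // compare_exchange reloads rc on failure, so a concurrent release that
    // drops the count to zero can never be "resurrected" here.
    if (t->refcount.compare_exchange_weak(rc, rc + 1)) return t;
  }
  return nullptr;
}

int main() {
  Toy t;
  std::printf("alive: %s\n", toy_weak_lock(&t) ? "locked" : "dead");
  t.refcount = 0;  // simulate the last strong reference going away
  std::printf("after release: %s\n", toy_weak_lock(&t) ? "locked" : "dead");
}
// (end of editor's aside)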
+// + +struct THFinalizer { + virtual void operator()() = 0; + virtual ~THFinalizer() {}; +}; + +typedef struct THStorage +{ + at::ScalarType scalar_type; + at::DataPtr data_ptr; + ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; + char flag; + at::Allocator *allocator; + std::unique_ptr finalizer; + + template + inline T * data() const { + auto scalar_type_T = at::CTypeToScalarType>::to(); + if (scalar_type != scalar_type_T) { + AT_ERROR("Attempt to access Storage having data type ", at::toString(scalar_type), + " as data type ", at::toString(scalar_type_T)); + } + return unsafe_data(); + } + + template + inline T * unsafe_data() const { + return static_cast(this->data_ptr.get()); + } +} THStorage; + +TH_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size); +TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, + at::Allocator *allocator); + +TH_API ptrdiff_t THStorage_size(const THStorage *self); +TH_API size_t THStorage_elementSize(); +TH_API THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); +TH_API void THStorage_retain(THStorage *storage); +TH_API THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); +TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); + +TH_API void THStorage_weakRetain(THStorage *weak_storage); +TH_API void THStorage_weakFree(THStorage *weak_storage); +TH_API THStorage* THStorage_weakLock(THStorage *weak_storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp new file mode 100644 index 0000000..48ddcd2 --- /dev/null +++ b/aten/src/TH/THTensor.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include +#include "THTensor.hpp" +#include "THVector.h" +#include "generic/simd/simd.h" + +#include "THBlas.h" +#include "THLapack.h" +#include "THRandom.h" +#include "THTensorDimApply.h" +#include "THMath.h" + +#include "generic/THTensor.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensor.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THTensorCopy.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorCopy.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THTensorRandom.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorMath.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorConv.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorLapack.cpp" +#include "THGenerateFloatTypes.h" + +#include + +void THTensor_free(THTensor *self) +{ + if(!self) + return; + + if(--self->refcount == 0) + { + delete self; + } +} + +// On a high level, +// 1. separate oldshape chunks of dimensions, where the dimensions are +// ``contiguous'' in each chunk, i.e., oldstride[i] = oldshape[i+1] * oldstride[i+1] +// 2. newshape must be able to be separated into same number of chunks as oldshape was separated into, +// where each chunk of newshape has matching ``numel'', i.e., number of subspaces, +// as the corresponding chunk of oldshape. 
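// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the simplest
// special case of the chunking described above. A fully contiguous tensor is
// a single chunk, so the new strides are just the row-major strides of the
// new shape, computed right to left; THTensor_compute_stride below only has
// to do real work when the old strides are non-contiguous (e.g. after a
// transpose), and it returns nullopt when no copy-free view exists.
// contiguous_strides is a hypothetical helper, not part of TH.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> stride(shape.size());
  int64_t running = 1;                   // stride of the innermost dimension
  for (int64_t d = (int64_t)shape.size() - 1; d >= 0; --d) {
    stride[d] = running;
    running *= shape[d] ? shape[d] : 1;  // treat size-0 dims like size-1
  }
  return stride;
}

int main() {
  // Viewing a contiguous {2, 3, 4} tensor (strides {12, 4, 1}) as {6, 4}
  // always succeeds, with strides {4, 1}:
  for (int64_t s : contiguous_strides({6, 4})) std::printf("%lld ", (long long)s);
  std::printf("\n");
}
// (end of editor's aside)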
+at::optional> +THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape) { + if (oldshape.empty()) { + return std::vector(newshape.size(), 1); + } + + // NOTE: stride is arbitrary is somewhat arbitrary in the numel() == 0 case; + // to match NumPy behavior we copy the strides if the size matches, otherwise + // we use the stride as if it were computed via resize. + // This could perhaps be combined with the below code, but the complexity didn't seem worth it. + int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); + if (numel == 0 && oldshape.equals(newshape)) { + return std::vector(oldstride); + } + + std::vector newstride(newshape.size()); + if (numel == 0) { + int64_t view_numel = 1; + for (int64_t view_d = newshape.size() - 1; view_d >= 0; view_d--) { + if (view_d == newshape.size() - 1) { + newstride[view_d] = 1; + } else { + newstride[view_d] = std::max(newshape[view_d+1], 1) * newstride[view_d+1]; + } + } + return newstride; + } + + int64_t view_d = newshape.size() - 1; + // stride for each subspace in the chunk + int64_t chunk_base_stride = oldstride.back(); + // numel in current chunk + int64_t tensor_numel = 1; + int64_t view_numel = 1; + for (int64_t tensor_d = oldshape.size() - 1; tensor_d >= 0; tensor_d--) { + tensor_numel *= oldshape[tensor_d]; + // if end of tensor size chunk, check view + if ((tensor_d == 0) || + (oldshape[tensor_d - 1] != 1 && oldstride[tensor_d - 1] != tensor_numel * chunk_base_stride)) { + while (view_d >= 0 && (view_numel < tensor_numel || newshape[view_d] == 1)) { + newstride[view_d] = view_numel * chunk_base_stride; + view_numel *= newshape[view_d]; + view_d--; + } + if (view_numel != tensor_numel) { + return at::nullopt; + } + if (tensor_d > 0) { + chunk_base_stride = oldstride[tensor_d - 1]; + tensor_numel = 1; + view_numel = 1; + } + } + } + if (view_d != -1) { + return at::nullopt; + } + return newstride; +} diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h new file mode 100644 index 0000000..3984bf9 --- /dev/null +++ b/aten/src/TH/THTensor.h @@ -0,0 +1,38 @@ +#ifndef TH_TENSOR_INC +#define TH_TENSOR_INC + +#include "THStorage.h" +#include "THTensorApply.h" + +#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) + +/* basics */ +#include "generic/THTensor.h" +#include "THGenerateAllTypes.h" + +#include "generic/THTensor.h" +#include "THGenerateHalfType.h" + +#include "generic/THTensorCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorCopy.h" +#include "THGenerateHalfType.h" + +/* random numbers */ +#include "THRandom.h" +#include "generic/THTensorRandom.h" +#include "THGenerateAllTypes.h" + +/* maths */ +#include "generic/THTensorMath.h" +#include "THGenerateAllTypes.h" + +/* convolutions */ +#include "generic/THTensorConv.h" +#include "THGenerateAllTypes.h" + +/* lapack support */ +#include "generic/THTensorLapack.h" +#include "THGenerateFloatTypes.h" +#endif diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp new file mode 100644 index 0000000..1d268a7 --- /dev/null +++ b/aten/src/TH/THTensor.hpp @@ -0,0 +1,87 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THTensor.h" +#include "THStorage.hpp" + +#include +#include + +struct THTensor +{ + THTensor(THStorage* storage) + : refcount(1) + , storage(storage) + , storageOffset(0) + // TODO: Naughty naughty! 
+ , size(static_cast(THAlloc(sizeof(int64_t)))) + , stride(static_cast(THAlloc(sizeof(int64_t)))) + , dim_(1) + { + size[0] = 0; + stride[0] = 1; + } + + ~THTensor() { + THFree(size); + THFree(stride); + if (storage) { + THStorage_free(storage); + } + } + + std::atomic refcount; + + // Note: storage->size may be greater than the recorded size + // of a tensor + THStorage *storage; + ptrdiff_t storageOffset; + + int64_t *size; + int64_t *stride; + int64_t dim_; + + template + inline T * data() const { + return storage->data() + storageOffset; + } + + template + inline T * unsafe_data() const { + return storage->unsafe_data() + storageOffset; + } + + // [NOTE: _dim() vs dim()] + // _dim() returns the "old" TH dimension view where no dimensions represents an empty tensor. + // dim() returns the ATen view of the dimensionality, i.e. 0-sized dimensions are supported. + inline int64_t _dim() const { + return is_empty() ? 0 : dim_; + } + + inline int64_t dim() const { + return dim_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + for (int64_t i = 0; i < dim_; ++i) { + if (size[i] == 0) { + return true; + } + } + return false; + } + + inline at::IntList sizes() { + return at::IntList(size, dim_); + } +}; + +#include "generic/THTensorFastGetSet.hpp" +#include "THGenerateAllTypes.h" + +TH_API void THTensor_free(THTensor *self); +TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, + at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h new file mode 100644 index 0000000..0b699e8 --- /dev/null +++ b/aten/src/TH/THTensorApply.h @@ -0,0 +1,526 @@ +#ifndef TH_TENSOR_APPLY_INC +#define TH_TENSOR_APPLY_INC + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where the + * data is no longer contiguous, i.e. the stride at that dimension is not equal to + * the size of the tensor defined by the outer dimensions. Let's call this outer + * (contiguous) tensor A. Note that if the Tensor is contiguous, then A is equal + * to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * index_B), + * i.e. basically we compute the offset into the storage as we would normally for a + * Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we + * can simply loop for sizeof(A) iterations and perform the operation, without having to + * follow the order described by the strides of A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in memory. For + * example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two + * dimensions can be merged for the purposes of APPLY, reducing the number of nested + * loops. 
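// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): step 3 of the
// strategy above, merging dimensions that are contiguous in memory, shown in
// isolation from the macro machinery. Two adjacent dimensions d and d+1 can
// be collapsed whenever stride[d] == stride[d+1] * size[d+1]. coalesce_dims
// is a hypothetical helper that rewrites size/stride in place.
#include <stdint.h>
#include <stdio.h>

static int64_t coalesce_dims(int64_t *size, int64_t *stride, int64_t ndim) {
  if (ndim == 0) return 0;
  int64_t out = 0;                 // index of the last dimension kept so far
  for (int64_t d = 1; d < ndim; d++) {
    if (stride[out] == stride[d] * size[d]) {
      size[out] *= size[d];        // contiguous: fold d into the kept dim
      stride[out] = stride[d];
    } else {
      ++out;                       // gap in memory: keep d as its own loop
      size[out] = size[d];
      stride[out] = stride[d];
    }
  }
  return out + 1;
}

int main(void) {
  // The 3x3x3x3 tensor narrowed from 3x3x4x3 mentioned above: only the two
  // outermost dimensions merge, so four nested loops collapse to two.
  int64_t size[4] = {3, 3, 3, 3}, stride[4] = {36, 12, 3, 1};
  int64_t nd = coalesce_dims(size, stride, 4);
  for (int64_t d = 0; d < nd; d++)   // prints: 9/12 then 9/1
    printf("size %lld stride %lld\n", (long long)size[d], (long long)stride[d]);
  return 0;
}
// (end of editor's aside)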
+ */ + +#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \ + TYPE *TENSOR##_data = NULL; \ + int64_t *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \ + int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ + TENSOR##_n = 1; \ + for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \ + TENSOR##_n *= TENSOR->size[TENSOR##_i]; \ +\ + if(TENSOR->is_empty()) \ + TH_TENSOR_APPLY_hasFinished = 1; \ + else \ + { \ + TENSOR##_data = TENSOR->storage->data()+TENSOR->storageOffset; \ + TENSOR##_size = 1; \ + TENSOR##_stride = 1; \ + for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ + if(TENSOR->size[TENSOR##_i] != 1) { \ + if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \ + TENSOR##_size *= TENSOR->size[TENSOR##_i]; \ + else{ \ + TENSOR##_contiguous = 0; \ + break; \ + } \ + } \ + } \ + if (!TENSOR##_contiguous) { \ + /* Find the dimension of contiguous sections */ \ + TENSOR##_dim = 1; \ + for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; TENSOR##_i--) \ + { \ + if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ + TENSOR##_dim++; \ + } \ + /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ + TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \ + TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \ + TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ + TH_TENSOR_dim_index = TENSOR##_dim-1; \ + TENSOR##_dimOffset = (DIM == TENSOR->_dim()-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->_dim()-1]; \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->_dim()-1]; \ + /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ + /* storage is given by storage_offset + (i * j), where i is the stride */ \ + /* vector and j is tensor_counter vector. This sets the starting position for the loop. 
*/ \ + for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \ + TENSOR##_counter[TENSOR##_i] = 0; \ + } \ + for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; --TENSOR##_i) { \ + if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \ + if (DIM != TENSOR->_dim()-1 && TENSOR##_i < DIM) \ + TENSOR##_dimOffset--; \ + } else { \ + --TH_TENSOR_dim_index; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ + } \ + } \ + /* Size of the inner most section */ \ + TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \ + /* Stride of the inner most section */ \ + TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \ + } \ + else{\ + TENSOR##_dim = 1;\ + TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*3);\ + TENSOR##_sizes = TENSOR##_counter + 1;\ + TENSOR##_strides = TENSOR##_counter + 2;\ + TENSOR##_sizes[0] = TENSOR##_n;\ + TENSOR##_strides[0] = 1;\ + TENSOR##_size = TENSOR##_sizes[0];\ + TENSOR##_stride = TENSOR##_strides[0];\ + }\ + } \ + TENSOR##_i = 0; + +#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \ + if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \ + { \ + if(TENSOR##_contiguous) \ + break; \ +\ + if(TENSOR##_dim == 1) \ + break; \ +\ + /* Reset pointer to beginning of loop */ \ + TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \ + for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \ + { \ + TENSOR##_counter[TENSOR##_i]++; \ + /* Jump ahread by the stride of this dimension */ \ + TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \ +\ + if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \ + { \ + if(TENSOR##_i == 0) \ + { \ + TH_TENSOR_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + /* Reset the pointer to the beginning of the chunk defined by this dimension */ \ + TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \ + TENSOR##_counter[TENSOR##_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + TENSOR##_i = 0; \ + } \ + +#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \ + \ + int elements_equal = 1; \ + if(TENSOR1##_n != TENSOR2##_n) { \ + elements_equal = 0; \ + } \ + else if(TENSOR1##_n != TENSOR3##_n) { \ + elements_equal = 0; \ + } \ + if (elements_equal == 0) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ + "number of elements, but got %d, %d and %d elements respectively", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \ + TENSOR1##_n, TENSOR2##_n, TENSOR3##_n); \ + } \ + \ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, 
TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \ + } \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter); \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ + if(TENSOR3##_counter != NULL) \ + THFree(TENSOR3##_counter); \ +} + +#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ + TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE) + +#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ +\ + if(TENSOR1##_n != TENSOR2##_n) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ + "number of elements, but got %d and %d elements respectively", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \ + } \ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ + } \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter); \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ +} + +#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ + TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE) + +#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \ +\ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \ + } \ + THFree(TENSOR##_counter); \ +} + +#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \ + TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE) + + +#ifdef _OPENMP + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#include + +/* + * Calcuate the memory offset of an element in a tensor. The strategy is below: + * + * 1. convert the line index(the index of the element) to the indexs(coordinates) in the tensor. + * It can hinted by a classical problem: Getting each individual digit from a whole integer(Decimal base). + * A N-digit decimal base number could be view as a N-dimension tensor and the sizes of the tensor are 10. + * So the value the whole interger is the line index. And the digits could be viewed as the indexes in + * different dimentions. + * + * 2. convert the indexs(coordinates) in the tensor to the memory offset. + * + * You can get the detailes in the for-statement iterations. + * + * The macro is only used in the first element in each thread. 
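+ * (Worked example, following the loop below: with collapsed sizes {2, 3, 4} and
+ * line_index_start = 13, counter_tmp comes out as {1, 0, 1}, since 13 % 4 = 1,
+ * then 3 % 3 = 0, then 1 % 2 = 1, and the memory offset is
+ * 1*strides[0] + 0*strides[1] + 1*strides[2].)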
For the rest, the memory offset could update + * according to info of the tensor in order to get better performance. So we should also record the each + * indexs in coresponding dimension of first element. + * The recorded info is stored in the TENSOR##_counter_tmp. + * + */ +#define __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR) \ + int64_t *TENSOR##_counter_tmp = (int64_t*)THAlloc(sizeof(int64_t) * TENSOR##_dim); \ + ptrdiff_t TENSOR##_memory_offset = 0; \ + ptrdiff_t TENSOR##_quot = line_index_start; \ + for (TENSOR##_i = TENSOR##_dim-1; TENSOR##_i>=0; --TENSOR##_i) { \ + TENSOR##_counter_tmp[TENSOR##_i] = TENSOR##_quot%TENSOR##_sizes[TENSOR##_i]; \ + TENSOR##_quot /= TENSOR##_sizes[TENSOR##_i]; \ + TENSOR##_memory_offset += TENSOR##_counter_tmp[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ + } + +/* + * The macro update the indexes in each dimension of the elements except for the first one allocated in + * each thread. + * For a tensor, if the index of some dimension reaches the size of the corresponding dimension. It will carry and clear. + * If the index of next high dimension does do, the index of next high dimension should carry and clear, too. + * + * The momery offset calculatation is a little confusing. If current index carries, the current index is set to 0. So + * the offset should decrease by size*stride of the last dimension. Then the index next high dimension increases by 1. So + * the offset should increase by stride of next high dimension. + */ +#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR) \ + if(TENSOR##_i == TENSOR##_size && TENSOR##_dim > 1){ /*reaches the edge*/ \ + int TENSOR##_carry_coord = 1; /*set carry flag to true*/ \ + TENSOR##_start = 0; /*the current index be cleared to 0*/\ + TENSOR##_data -= TENSOR##_size * TENSOR##_stride; /*the momery offset reset to the first one in current dimension */\ + for(TENSOR##_i = TENSOR##_dim - 2; (TENSOR##_i >= 0) && (TENSOR##_carry_coord); TENSOR##_i--){ \ + TENSOR##_counter_tmp[TENSOR##_i]++; /*the index of next high dimension update*/ \ + TENSOR##_data += TENSOR##_strides[TENSOR##_i]; /*memory offset increase by stride of next high dimension*/\ + if(TENSOR##_counter_tmp[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]){ /*The next high dimension also carry, continue + to clear and carry*/\ + TENSOR##_data -= TENSOR##_sizes[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ + TENSOR##_counter_tmp[TENSOR##_i] = 0; \ + } else { \ + TENSOR##_carry_coord = 0; \ + } \ + } \ + } else { \ + TENSOR##_start = TENSOR##_i; \ + } + + +#define TH_TENSOR_APPLY_REDUCTION_OMP(TYPE, TENSOR, OPERATION, CODE, OMP_THRESHOLD) \ +{\ + int TENSOR##Contg = THTensor_(isContiguous)(TENSOR); \ + ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \ + if(TENSOR##Contg){ \ + ptrdiff_t iter = 0; \ + TYPE *rp = TENSOR->storage->data()+TENSOR->storageOffset; \ + PRAGMA( omp parallel for if (TENSOR##Size > OMP_THRESHOLD * 10) firstprivate(rp) reduction(OPERATION) ) \ + for (iter = 0; iter < TENSOR##Size; iter++) { \ + TYPE *TENSOR##_data = rp+iter; \ + CODE \ + } \ + } else { \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, -1, 1);\ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (TENSOR##Size > OMP_THRESHOLD) firstprivate(TENSOR##_data, TENSOR##_sizes, TENSOR##_strides, TENSOR##_dim, TENSOR##_stride, TENSOR##_size, TENSOR##_i) reduction(OPERATION))\ + {\ + size_t num_threads = omp_get_num_threads();\ + size_t tid = omp_get_thread_num();\ + size_t 
line_seg_length_avg = TENSOR##Size/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? (TENSOR##Size - line_index_start):line_seg_length_avg; \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR);\ + TENSOR##_data += TENSOR##_memory_offset;\ + ptrdiff_t count = 0;\ + ptrdiff_t TENSOR##_start = TENSOR##_counter_tmp[TENSOR##_dim - 1];\ + while(count < line_seg_length){\ + for(TENSOR##_i=TENSOR##_start; (count < line_seg_length)&&(TENSOR##_i < TENSOR##_size); ++TENSOR##_i, ++count){\ + CODE\ + TENSOR##_data += TENSOR##_stride;\ + }\ + if(count < line_seg_length){\ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR);\ + }\ + }\ + if(TENSOR##_counter_tmp != NULL) \ + THFree(TENSOR##_counter_tmp); \ + }\ + }\ + if(TENSOR##_counter != NULL)\ + THFree(TENSOR##_counter);\ + }\ +} + +#define TH_TENSOR_APPLY2_OMP(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, OMP_THRESHOLD) \ +{ \ + /* for advanced searching index*/ \ + if( CONTIG1 && CONTIG2 ){ \ + TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ + TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + ptrdiff_t iter = 0; \ + if(tp != (TYPE2*)rp) { \ + PRAGMA(ivdep) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \ + for (iter = 0; iter < SIZE; iter++) { \ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE1 *TENSOR1##_data = rp+iter; \ + CODE \ + }\ + } else {\ + PRAGMA(simd) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE2* TENSOR2##_data = tp+iter;\ + TYPE1* TENSOR1##_data = rp+iter;\ + CODE \ + }\ + }\ + } else { \ + /* The following strategy is not easy to understand. + * 1. Collapse the dimension of the tensors in order to decrease the number of nested loops. + * 2. Calculate the numbers of elements allocated in each thread and the line index of the first one. + * 3. Calculate the memory offset of the first element and the indexes in each dimension of the + * first one. + * 4. iterate all elements in each thread. update the indexes in each dimension of the rest. + */ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + /*step 1*/ \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i)) \ + { \ + /*step 2*/ \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + size_t line_seg_length_avg = SIZE/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(SIZE - line_index_start):line_seg_length_avg; \ + /* step 3*/ \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \ + TENSOR2##_data += TENSOR2##_memory_offset; \ + TENSOR1##_data += TENSOR1##_memory_offset; \ + ptrdiff_t count = 0; \ + ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim-1]; \ + ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim-1]; \ + /* step 4*/ \ + while (count < line_seg_length) { \ + for(TENSOR2##_i=TENSOR2##_start, TENSOR1##_i = TENSOR1##_start; ((count < line_seg_length) && (TENSOR2##_i < TENSOR2##_size) && (TENSOR1##_i < TENSOR1##_size)); ++TENSOR2##_i, ++TENSOR1##_i, ++count){ \ + CODE \ + TENSOR2##_data += TENSOR2##_stride; \ + TENSOR1##_data += TENSOR1##_stride; \ + } \ + if (count < line_seg_length){ \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR2); \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR1); \ + } \ + } \ + if(TENSOR1##_counter_tmp != NULL) \ + THFree(TENSOR1##_counter_tmp); \ + if(TENSOR2##_counter_tmp != NULL) \ + THFree(TENSOR2##_counter_tmp); \ + } \ + } \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter);\ + }\ +} + +#define TH_TENSOR_APPLY3_OMP(SIZE, CONTIG1, CONTIG2, CONTIG3, TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE, OMP_THRESHOLD) \ +{ \ + /* for adveanced searching index*/ \ + if(CONTIG1 && CONTIG2 && CONTIG3){ \ + TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ + TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + TYPE3 *srcp = TENSOR3->storage->data()+TENSOR3->storageOffset; \ + ptrdiff_t iter = 0;\ + if(tp != (TYPE2*)rp) { \ + PRAGMA(ivdep) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE1 *TENSOR1##_data = rp+iter;\ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE3 *TENSOR3##_data = srcp+iter;\ + CODE \ + } \ + } else {\ + PRAGMA(simd) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE1 *TENSOR1##_data = rp+iter;\ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE3 *TENSOR3##_data = srcp+iter;\ + CODE \ + } \ + }\ + } else{ \ + int TH_TENSOR_APPLY_hasFinished = 0;\ + int64_t TH_TENSOR_dim_index = 0;\ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, -1, 1) \ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i, TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR3##_data, TENSOR3##_sizes, TENSOR3##_strides, TENSOR3##_dim, TENSOR3##_stride, TENSOR3##_size, TENSOR3##_i))\ + {\ + size_t num_threads = omp_get_num_threads();\ + size_t tid = omp_get_thread_num();\ + size_t line_seg_length_avg = SIZE/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(SIZE - line_index_start):line_seg_length_avg; \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1);\ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2);\ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR3);\ + TENSOR1##_data += TENSOR1##_memory_offset;\ + TENSOR2##_data += TENSOR2##_memory_offset;\ + TENSOR3##_data += TENSOR3##_memory_offset;\ + ptrdiff_t count = 0;\ + ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim - 1];\ + ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim - 1];\ + ptrdiff_t TENSOR3##_start = TENSOR3##_counter_tmp[TENSOR3##_dim - 1];\ + while(count < line_seg_length){\ + for(TENSOR1##_i=TENSOR1##_start, TENSOR2##_i=TENSOR2##_start,TENSOR3##_i=TENSOR3##_start; (countdim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + continue; \ + if (TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (shape_check_flag == 1) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("Expected %s %s, %s %s and %s %s to have the same size apart from dimension %d", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ + } \ +} + +#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, SIZE_CHECK, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = NULL; \ + TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + TYPE2 *TENSOR2##_data = NULL; \ + TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + TYPE3 *TENSOR3##_data = NULL; \ + TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ + int same_dims = 1; \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + same_dims = 0; \ + } \ + if( TENSOR1->dim() != TENSOR3->dim() ) { \ + same_dims = 0; \ + } \ + if (same_dims == 0) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ + "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \ + } \ + SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ +\ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ + TENSOR1##_size = TENSOR1->size[DIMENSION]; \ +\ + TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ + TENSOR2##_size = TENSOR2->size[DIMENSION]; \ +\ + TENSOR3##_data = 
(TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ + TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \ + TENSOR3##_size = TENSOR3->size[DIMENSION]; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR1->dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +/** + * Similar to DIM_APPLY(...) but we maintain two sets of pointers: one for the first tensor + * and one for the second. The two tensors must have the same shape, other than at the + * specified DIMENSION. This function makes it easy to store the output from reducing the + * TENSOR at index. For example, in the sum example described below, we could instead do: + * + * int64_t i = 0; + * TYPE1 sum; + * + * for (i = 0; i < TENSOR1##_size; ++i) { + * sum += TENSOR1##_data[i * TENSOR1##_stride] + * } + * *TENSOR2##_data = (TYPE2) sum; + * + * In particular, we guarantee that the offset into TENSOR2 will be what you would get if + * you applied all of the index values used to generate the offset into TENSOR1. 
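+ *
+ * (A typical pattern: TENSOR1 is the input being reduced along DIMENSION and
+ * TENSOR2 is the output, usually with size 1 along DIMENSION so that each
+ * invocation of CODE writes exactly one result element, as in the sum sketch above.)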
+ */ +#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = NULL; \ + TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + TYPE2 *TENSOR2##_data = NULL; \ + TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ + "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + } \ + TH_UNUSED int shape_check_flag = 0; \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + continue; \ + if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("Expected %s %s and %s %s to have the same size in dimension %d", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION); \ + } \ + } \ +\ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ + TENSOR1##_size = TENSOR1->size[DIMENSION]; \ +\ + TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ + TENSOR2##_size = TENSOR2->size[DIMENSION]; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR1->dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +/** + * The basic idea for DIM_APPLY: Given a TENSOR and a DIMENSION, provide access to the data stored + * at all sets of dimension values other than DIMENSION, such that we 
can get all the values at those + * fixed indices for the various values at DIMENSION. + * + * Suppose we have a 2x3x4 Tensor A, and we have DIMENSION=2. Then we will hit CODE (2x3) times, and the + * pointer into storage will be at: + * + * A[0][0] + * A[0][1] + * A[0][2] + * A[1][0] + * A[1][1] + * A[1][2] + * + * And at each point, we can access the data for each of the four elements of the Tensor via + * TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do: + * + * int64_t i = 0; + * TYPE sum; + * for (i = 0; i < TENSOR##_size; i++) { + * sum += TENSOR##_data[i * TENSOR##_stride] + * } + * + * Note that we don't have to have DIMENSION be the last tensor. If we have DIMENSION=1, then we will hit the + * code (2x4) times, with pointer into the storage at: + * + * offset + + * stride_0 * 0 + stride_2 * 0 + * stride_0 * 1 + stride_2 * 0 + * stride_0 * 0 + stride_2 * 1 + * stride_0 * 1 + stride_2 * 1 + * stride_0 * 0 + stride_2 * 2 + * stride_0 * 1 + stride_2 * 2 + * stride_0 * 0 + stride_2 * 3 + * stride_0 * 1 + stride_2 * 3 + * + * So we can again sum over the values at DIMENSION with the other indices fixed. + */ +#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \ +{ \ + TYPE *TENSOR##_data = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR->_dim()) ) \ + THError("invalid dimension"); \ +\ + TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ + TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ + TENSOR##_size = TENSOR->size[DIMENSION]; \ + /* Counter stores the indices into the Tensor at any time */ \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR->_dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR->_dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + /* Check if the index is equal to DIMENSION. We don't need to update the */ \ + /* offset if this is the case, and can consider the next index. However, */ \ + /* in the case that the DIMENSION is the last index in the Tensor, then */ \ + /* we have parsed the entire tensor and can exit */ \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + /* Bump the counter at this index, update the pointer */ \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + /* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. 
If this is the last dimension, exit */ \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + /* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \ + TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +#endif diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp new file mode 100644 index 0000000..30dd076 --- /dev/null +++ b/aten/src/TH/THTypeConversion.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include +#include "THHalf.h" + +// Type traits to convert types to TH-specific types. Used primarily to +// convert at::Half to TH's half type. This makes the conversion explicit. +// FIXME: we should just use the same type + +namespace th { + +template +struct FromTypeConversion { + using type = T; +}; + +template <> +struct FromTypeConversion { + using type = at::Half; +}; + +template +using from_type = typename FromTypeConversion::type; +} diff --git a/aten/src/TH/THVector.cpp b/aten/src/TH/THVector.cpp new file mode 100644 index 0000000..3460d17 --- /dev/null +++ b/aten/src/TH/THVector.cpp @@ -0,0 +1,30 @@ +#include "THVector.h" + +#include "generic/simd/simd.h" + +#ifdef __NEON__ +#include "vector/NEON.cpp" +#endif + +#ifdef __PPC64__ +#include "vector/VSX.cpp" +#endif + +#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ + || defined(USE_SSE4_1) || defined(USE_SSE4_2) +#include "vector/SSE.cpp" +#endif + +#if defined(USE_AVX) +#include "vector/AVX.h" +#endif + +#if defined(USE_AVX2) +#include "vector/AVX2.h" +#endif + +#include "generic/THVectorDefault.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THVectorDispatch.cpp" +#include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THVector.h b/aten/src/TH/THVector.h new file mode 100644 index 0000000..8054f64 --- /dev/null +++ b/aten/src/TH/THVector.h @@ -0,0 +1,14 @@ +#ifndef TH_VECTOR_INC +#define TH_VECTOR_INC + +#include "THGeneral.h" +#include "THMath.h" + +#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) + +/* We are going to use dynamic dispatch, and want only to generate declarations + * of the vector functions */ +#include "generic/THVector.h" +#include "THGenerateAllTypes.h" + +#endif // TH_VECTOR_INC diff --git a/aten/src/TH/generic/THBlas.cpp b/aten/src/TH/generic/THBlas.cpp new file mode 100644 index 0000000..d06ae6a --- /dev/null +++ b/aten/src/TH/generic/THBlas.cpp @@ -0,0 +1,435 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THBlas.cpp" +#else + + +#ifdef BLAS_F2C +# define ffloat double +#else +# define ffloat float +#endif + +TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy); +TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx); +TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx); +TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy); +TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy); +TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy); +#ifdef BLAS_USE_CBLAS_DOT 
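+/* Some BLAS builds are driven through the CBLAS interface for sdot; the inline
+   wrapper below adapts the Fortran-style sdot_ signature (pointer arguments) to
+   cblas_sdot by dereferencing and forwarding the arguments. */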
+TH_EXTERNC float cblas_sdot(const int n, const float *x, const int incx, const float *y, const int incy); +#ifndef THBlas_C_sdot_ +#define THBlas_C_sdot_ +static inline ffloat sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy) +{ + return cblas_sdot(*n, x, *incx, y, *incy); +} +#endif +#else +TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy); +#endif +TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); +TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy); +TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda); +TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); +TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); +TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); + + + +void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dswap_(&i_n, x, &i_incx, y, &i_incy); +#else + sswap_(&i_n, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) + { + real z = x[i*incx]; + x[i*incx] = y[i*incy]; + y[i*incy] = z; + } + } +} + +void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx) +{ + if(n == 1) + incx = 1; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + +#if defined(TH_REAL_IS_DOUBLE) + dscal_(&i_n, &a, x, &i_incx); +#else + sscal_(&i_n, &a, x, &i_incx); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) { + if (a == 0) { + x[i*incx] = 0; + } else { + x[i*incx] *= a; + } + } + } +} + +void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dcopy_(&i_n, x, &i_incx, y, &i_incy); +#else + scopy_(&i_n, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) + y[i*incy] = x[i*incx]; + } +} + +void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + daxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +#else + saxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + 
for(i = 0; i < n; i++) + y[i*incy] += a*x[i*incx]; + } +} + +real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + return (real) ddot_(&i_n, x, &i_incx, y, &i_incy); +#else + return (real) sdot_(&i_n, x, &i_incx, y, &i_incy); +#endif + } +#endif + { + int64_t i; + real sum = 0; + for(i = 0; i < n; i++) + sum += x[i*incx]*y[i*incy]; + return sum; + } +} + +void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy) +{ + if(n == 1) + lda = m; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, m), 6, + "lda should be at least max(1, m=%d), but have %d", m, lda); + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); +#else + sgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i, j; + + if( (trans == 'T') || (trans == 't') ) + { + for(i = 0; i < n; i++) + { + real sum = 0; + real *row_ = a+lda*i; + for(j = 0; j < m; j++) + sum += x[j*incx]*row_[j]; + if (beta == 0) + y[i*incy] = alpha*sum; + else + y[i*incy] = beta*y[i*incy] + alpha*sum; + } + } + else + { + if(beta != 1) + THBlas_(scal)(m, beta, y, incy); + + for(j = 0; j < n; j++) + { + real *column_ = a+lda*j; + real z = alpha*x[j*incx]; + for(i = 0; i < m; i++) + y[i*incy] += z*column_[i]; + } + } + } +} + +void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda) +{ + if(n == 1) + lda = m; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, m), 9, + "lda should be at least max(1, m=%d), but have %d", m, lda); + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); +#else + sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); +#endif + return; + } +#endif + { + int64_t i, j; + for(j = 0; j < n; j++) + { + real *column_ = a+j*lda; + real z = alpha*y[j*incy]; + for(i = 0; i < m; i++) + column_[i] += z*x[i*incx] ; + } + } +} + +void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc) +{ + int transa_ = ((transa == 't') || (transa == 'T')); + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + ldc = m; + + if(transa_) + { + if(m == 1) + lda = k; + } + else + { + if(k == 1) + lda = m; + } + + if(transb_) + { + if(k == 1) + ldb = n; + } + else + { + if(n == 1) + ldb = k; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || 
defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && + (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, (transa_ ? k : m)), 8, + "lda should be at least max(1, %d), but have %d", (transa_ ? k : m), lda); + THArgCheck(ldb >= THMax(1, (transb_ ? n : k)), 10, + "ldb should be at least max(1, %d), but have %d", (transb_ ? n : k), ldb); + THArgCheck(ldc >= THMax(1, m), 13, + "ldc should be at least max(1, m=%d), but have %d", m, ldc); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + +#if defined(TH_REAL_IS_DOUBLE) + dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); +#else + sgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); +#endif + return; + } +#endif + { + int64_t i, j, l; + if(!transa_ && !transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l*lda]*b_[l]; + b_ += ldb; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_++; + } + } + else if(transa_ && !transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l]*b_[l]; + b_ += ldb; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_ += lda; + } + } + else if(!transa_ && transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l*lda]*b_[l*ldb]; + b_++; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_++; + } + } + else + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l]*b_[l*ldb]; + b_++; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_ += lda; + } + } + } +} + +#endif diff --git a/aten/src/TH/generic/THBlas.h b/aten/src/TH/generic/THBlas.h new file mode 100644 index 0000000..c36e796 --- /dev/null +++ b/aten/src/TH/generic/THBlas.h @@ -0,0 +1,19 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THBlas.h" +#else + +/* Level 1 */ +TH_API void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx); +TH_API void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy); +TH_API real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); + +/* Level 2 */ +TH_API void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy); +TH_API void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda); + +/* Level 3 */ +TH_API void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc); + +#endif diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp new file mode 100644 index 0000000..8f3ccc8 --- /dev/null +++ b/aten/src/TH/generic/THLapack.cpp 
@@ -0,0 +1,270 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THLapack.cpp" +#else + + +TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); +TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); +TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); +TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info); +TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info); +TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); +TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); +TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); +TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); +TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info); +TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info); +TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); +TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); +TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); +TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); +TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); +TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info); +TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info); +TH_EXTERNC void spotrf_(char *uplo, int *n, float *a, int *lda, int *info); +TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); +TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); +TH_EXTERNC void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); +TH_EXTERNC void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +TH_EXTERNC void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +TH_EXTERNC void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +TH_EXTERNC void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +TH_EXTERNC void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, 
float *c, int *ldc, float *work, int *lwork, int *info); +TH_EXTERNC void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); +TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *rank, float *tol, float *work, int *info); +TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info); + + +/* Compute the solution to a real system of linear equations A * X = B */ +void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#else + sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#endif +#else + THError("gesv : Lapack library not found in compile time\n"); +#endif + return; +} + +/* Solve a triangular system of the form A * X = B or A^T * X = B */ +void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dtrtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); +#else + strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); +#endif +#else + THError("trtrs : Lapack library not found in compile time\n"); +#endif + return; +} + +/* Solve overdetermined or underdetermined real linear systems involving an +M-by-N matrix A, or its transpose, using a QR or LQ factorization of A */ +void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); +#else + sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); +#endif +#else + THError("gels : Lapack library not found in compile time\n"); +#endif +} + +/* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric +matrix A */ +void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); +#else + ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); +#endif +#else + THError("syev : Lapack library not found in compile time\n"); +#endif +} + +/* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and, +optionally, the left and/or right eigenvectors */ +void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +#else + sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +#endif +#else + THError("geev : Lapack library not found in compile time\n"); +#endif +} + +/* Compute the singular value decomposition (SVD) of a real M-by-N matrix A, +optionally computing the left and/or right singular vectors */ +void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); +#else + sgesvd_( &jobu, &jobvt, 
&m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); +#endif +#else + THError("gesvd : Lapack library not found in compile time\n"); +#endif +} + +/* LU decomposition */ +void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetrf_(&m, &n, a, &lda, ipiv, info); +#else + sgetrf_(&m, &n, a, &lda, ipiv, info); +#endif +#else + THError("getrf : Lapack library not found in compile time\n"); +#endif +} + +void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#else + sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#endif +#else + THError("getrs : Lapack library not found in compile time\n"); +#endif +} + +/* Matrix Inverse */ +void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetri_(&n, a, &lda, ipiv, work, &lwork, info); +#else + sgetri_(&n, a, &lda, ipiv, work, &lwork, info); +#endif +#else + THError("getri : Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization */ +void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotrf_(&uplo, &n, a, &lda, info); +#else + spotrf_(&uplo, &n, a, &lda, info); +#endif +#else + THError("potrf : Lapack library not found in compile time\n"); +#endif +} + +/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ +void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +#else + spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +#endif +#else + THError("potrs: Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization based Matrix Inverse */ +void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotri_(&uplo, &n, a, &lda, info); +#else + spotri_(&uplo, &n, a, &lda, info); +#endif +#else + THError("potri: Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization with complete pivoting */ +void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); +#else + spstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); +#endif +#else + THError("pstrf: Lapack library not found at compile time\n"); +#endif +} + +/* QR decomposition */ +void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info); +#else + sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info); +#endif +#else + THError("geqrf: Lapack library not found in compile time\n"); +#endif +} + +/* Build Q from output of geqrf */ +void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); +#else + sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); +#endif +#else + THError("orgqr: Lapack 
library not found in compile time\n"); +#endif +} + +/* Multiply Q with a matrix using the output of geqrf */ +void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); +#else + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); +#endif +#else + THError("ormqr: Lapack library not found in compile time\n"); +#endif +} + + +#endif diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h new file mode 100644 index 0000000..fe64dae --- /dev/null +++ b/aten/src/TH/generic/THLapack.h @@ -0,0 +1,40 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THLapack.h" +#else + +/* AX=B */ +TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info); +/* Solve a triangular system of the form A * X = B or A^T * X = B */ +TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info); +/* ||AX-B|| */ +TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info); +/* Eigenvals */ +TH_API void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info); +/* Non-sym eigenvals */ +TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info); +/* svd */ +TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info); +/* LU decomposition */ +TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info); +TH_API void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info); +/* Matrix Inverse */ +TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info); + +/* Positive Definite matrices */ +/* Cholesky factorization */ +TH_API void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info); +/* Matrix inverse based on Cholesky factorization */ +TH_API void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info); +/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ +TH_API void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info); +/* Cholesky factorization with complete pivoting. 
*/ +TH_API void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info); + +/* QR decomposition */ +TH_API void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info); +/* Build Q from output of geqrf */ +TH_API void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info); +/* Multiply Q with a matrix from output of geqrf */ +TH_API void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info); + +#endif diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp new file mode 100644 index 0000000..2d499b0 --- /dev/null +++ b/aten/src/TH/generic/THStorage.cpp @@ -0,0 +1,136 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorage.cpp" +#else + +#include + +real* THStorage_(data)(const THStorage *self) +{ + return self->data(); +} + +ptrdiff_t THStorage_(size)(const THStorage *self) +{ + return THStorage_size(self); +} + +size_t THStorage_(elementSize)() +{ + return sizeof(real); +} + +THStorage* THStorage_(new)(void) +{ + return THStorage_new(at::CTypeToScalarType>::to()); +} + +THStorage* THStorage_(newWithSize)(ptrdiff_t size) +{ + return THStorage_newWithSize(at::CTypeToScalarType>::to(), size); +} + +THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, + at::Allocator *allocator) +{ + return THStorage_newWithAllocator(at::CTypeToScalarType>::to(), size, allocator); +} + + +THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) +{ + return THStorage_newWithMapping(at::CTypeToScalarType>::to(), filename, size, flags); +} + +THStorage* THStorage_(newWithSize1)(real data0) +{ + THStorage *self = THStorage_(newWithSize)(1); + real *data = THStorage_(data)(self); + data[0] = data0; + return self; +} + +THStorage* THStorage_(newWithSize2)(real data0, real data1) +{ + THStorage *self = THStorage_(newWithSize)(2); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + return self; +} + +THStorage* THStorage_(newWithSize3)(real data0, real data1, real data2) +{ + THStorage *self = THStorage_(newWithSize)(3); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + data[2] = data2; + return self; +} + +THStorage* THStorage_(newWithSize4)(real data0, real data1, real data2, real data3) +{ + THStorage *self = THStorage_(newWithSize)(4); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + data[2] = data2; + data[3] = data3; + return self; +} + +void THStorage_(setFlag)(THStorage *storage, const char flag) +{ + THStorage_setFlag(storage, flag); +} + +void THStorage_(clearFlag)(THStorage *storage, const char flag) +{ + THStorage_clearFlag(storage, flag); +} + +void THStorage_(retain)(THStorage *storage) +{ + THStorage_retain(storage); +} + +void THStorage_(free)(THStorage *storage) +{ + THStorage_free(storage); +} + +THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator) { + return THStorage_newWithDataAndAllocator(at::CTypeToScalarType>::to(), std::move(data), size, allocator); +} + +void THStorage_(resize)(THStorage *storage, ptrdiff_t size) +{ + return THStorage_resize(storage, size); +} + +void THStorage_(fill)(THStorage *storage, real value) +{ + ptrdiff_t i; + for(i = 0; i < storage->size; i++) + THStorage_(data)(storage)[i] = value; +} + +void THStorage_(set)(THStorage *self, 
ptrdiff_t idx, real value) +{ + THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); + THStorage_(data)(self)[idx] = value; +} + +real THStorage_(get)(const THStorage *self, ptrdiff_t idx) +{ + THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); + return THStorage_(data)(self)[idx]; +} + +void THStorage_(swap)(THStorage *storage1, THStorage *storage2) +{ + THStorage_swap(storage1, storage2); +} + +#endif diff --git a/aten/src/TH/generic/THStorage.h b/aten/src/TH/generic/THStorage.h new file mode 100644 index 0000000..4850c47 --- /dev/null +++ b/aten/src/TH/generic/THStorage.h @@ -0,0 +1,74 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorage.h" +#else + +#ifdef __cplusplus +#include +#endif + +/* on pourrait avoir un liste chainee + qui initialise math, lab structures (or more). + mouais -- complique. + + Pb: THMapStorage is kind of a class + THLab_()... comment je m'en sors? + + en template, faudrait que je les instancie toutes!!! oh boy! + Et comment je sais que c'est pour Cuda? Le type float est le meme dans les <> + + au bout du compte, ca serait sur des pointeurs float/double... etc... = facile. + primitives?? + */ + +#define TH_STORAGE_REFCOUNTED 1 +#define TH_STORAGE_RESIZABLE 2 + +// Struct definition is moved to THStorage.hpp (so this file stays C compatible) +typedef struct THStorage THStorage; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THStorage type. +#define THFloatStorage THStorage +#define THDoubleStorage THStorage +#define THHalfStorage THStorage +#define THByteStorage THStorage +#define THCharStorage THStorage +#define THShortStorage THStorage +#define THIntStorage THStorage +#define THLongStorage THStorage + +TH_API real* THStorage_(data)(const THStorage*); +TH_API ptrdiff_t THStorage_(size)(const THStorage*); +TH_API size_t THStorage_(elementSize)(void); + +/* slow access -- checks everything */ +TH_API void THStorage_(set)(THStorage*, ptrdiff_t, real); +TH_API real THStorage_(get)(const THStorage*, ptrdiff_t); + +TH_API THStorage* THStorage_(new)(void); +TH_API THStorage* THStorage_(newWithSize)(ptrdiff_t size); +TH_API THStorage* THStorage_(newWithSize1)(real); +TH_API THStorage* THStorage_(newWithSize2)(real, real); +TH_API THStorage* THStorage_(newWithSize3)(real, real, real); +TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real); +TH_API THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags); + +TH_API THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, + THAllocator* allocator); +#ifdef __cplusplus +TH_API THStorage* THStorage_(newWithDataAndAllocator)( + at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator); +#endif + +/* should not differ with API */ +TH_API void THStorage_(setFlag)(THStorage *storage, const char flag); +TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag); +TH_API void THStorage_(retain)(THStorage *storage); +TH_API void THStorage_(swap)(THStorage *storage1, THStorage *storage2); + +/* might differ with other API (like CUDA) */ +TH_API void THStorage_(free)(THStorage *storage); +TH_API void THStorage_(resize)(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_(fill)(THStorage *storage, real value); + +#endif diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp new file mode 100644 index 0000000..946be62 --- /dev/null +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -0,0 +1,87 @@ +#ifndef 
TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorageCopy.cpp" +#else + +void THStorage_(rawCopy)(THStorage *storage, real *src) +{ + ptrdiff_t i; + real *data = THStorage_(data)(storage); + for(i = 0; i < storage->size; i++) + data[i] = src[i]; +} + +void THStorage_(copy)(THStorage *storage, THStorage *src) +{ + THArgCheck(storage->size == src->size, 2, "size mismatch"); + THStorage_(rawCopy)(storage, THStorage_(data)(src)); +} + +// NOTE: for performance, these macros generally use the raw data pointer in the inner loops, +// rather than repeated THStorage_(data) calls. + +#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = static_cast(src_data[i]); \ +} + +#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = (real)TH_half2float(src_data[i]); \ +} + +#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = TH_float2half((float)(src_data[i])); \ +} + +#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = static_cast(src_data[i]); \ +} + +#ifndef TH_REAL_IS_HALF +IMPLEMENT_THStorage_COPY(Byte) +IMPLEMENT_THStorage_COPY(Char) +IMPLEMENT_THStorage_COPY(Short) +IMPLEMENT_THStorage_COPY(Int) +IMPLEMENT_THStorage_COPY(Long) +IMPLEMENT_THStorage_COPY(Float) +IMPLEMENT_THStorage_COPY(Double) +IMPLEMENT_THStorage_COPY_FROM_HALF(Half) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) +IMPLEMENT_THStorage_COPY_TO_HALF(Byte) +IMPLEMENT_THStorage_COPY_TO_HALF(Char) +IMPLEMENT_THStorage_COPY_TO_HALF(Short) +IMPLEMENT_THStorage_COPY_TO_HALF(Int) +IMPLEMENT_THStorage_COPY_TO_HALF(Long) +IMPLEMENT_THStorage_COPY_TO_HALF(Float) +IMPLEMENT_THStorage_COPY_TO_HALF(Double) +#endif + + +#endif diff --git a/aten/src/TH/generic/THStorageCopy.h b/aten/src/TH/generic/THStorageCopy.h new file mode 100644 index 0000000..ce8a2a6 --- /dev/null +++ b/aten/src/TH/generic/THStorageCopy.h @@ -0,0 +1,18 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorageCopy.h" +#else + +/* Support for copy between different Storage types */ + +TH_API void THStorage_(rawCopy)(THStorage *storage, real *src); +TH_API void THStorage_(copy)(THStorage *storage, THStorage *src); +TH_API void THStorage_(copyByte)(THStorage *storage, struct THByteStorage *src); +TH_API void THStorage_(copyChar)(THStorage *storage, struct THCharStorage *src); +TH_API void 
THStorage_(copyShort)(THStorage *storage, struct THShortStorage *src); +TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src); +TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src); +TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src); +TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src); +TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src); + +#endif diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp new file mode 100644 index 0000000..0428c8f --- /dev/null +++ b/aten/src/TH/generic/THTensor.cpp @@ -0,0 +1,890 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensor.cpp" +#else + +#include + +/**** access methods ****/ +THStorage *THTensor_(storage)(const THTensor *self) +{ + return self->storage; +} + +ptrdiff_t THTensor_(storageOffset)(const THTensor *self) +{ + return self->storageOffset; +} + +int THTensor_(nDimension)(const THTensor *self) +{ + return self->dim(); +} + +int THTensor_(_nDimension)(const THTensor *self) +{ + return self->_dim(); +} + +int64_t THTensor_(size)(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); + return self->size[dim]; +} + +int64_t THTensor_(stride)(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); + return self->stride[dim]; +} + +THLongStorage *THTensor_(newSizeOf)(THTensor *self) +{ + THLongStorage *size = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(size, self->size); + return size; +} + +THLongStorage *THTensor_(newStrideOf)(THTensor *self) +{ + THLongStorage *stride = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(stride, self->stride); + return stride; +} + +real *THTensor_(data)(const THTensor *self) +{ + if(self->storage) + return (THStorage_(data)(self->storage)+self->storageOffset); + else + return NULL; +} + +/**** creation methods ****/ + +/* Empty init */ +THTensor *THTensor_(new)(void) +{ + return new THTensor(THStorage_(new)()); +} + +/* Pointer-copy init */ +THTensor *THTensor_(newWithTensor)(THTensor *tensor) +{ + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(setStorageNd)(self, + tensor->storage, + tensor->storageOffset, + tensor->dim(), + tensor->size, + tensor->stride); + return self; +} + +/* Storage init */ +THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) +{ + if(size && stride) { + THArgCheck(size->size == stride->size, 4, "inconsistent size"); + } + AT_CHECK(size, "size must not be null"); + + THTensor *self = new THTensor(THStorage_(new)()); +#ifdef DEBUG + THAssert(size->size <= INT_MAX); +#endif + THTensor_(setStorageNd)(self, + storage, + storageOffset, + size->size, + THLongStorage_data(size), + (stride ? 
THLongStorage_data(stride) : NULL)); + + return self; +} + +THTensor *THTensor_(newWithStorageIntLists)(THStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); + + return self; +} + +THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0}, {stride0}); +} + +THTensor *THTensor_(newWithStorage2d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0, size1}, {stride0, stride1}); +} + +THTensor *THTensor_(newWithStorage3d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0, size1, size2}, {stride0, stride1, stride2}); +} + +THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2, + int64_t size3, int64_t stride3) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, + {size0, size1, size2, size3}, + {stride0, stride1, stride2, stride3}); +} + +THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) +{ + return THTensor_(newWithStorage)(NULL, 0, size, stride); +} + +THTensor *THTensor_(newWithSizeIntList)(at::IntList sizes) { + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(resizeNd)(self, sizes.size(), const_cast(sizes.data()), nullptr); + + return self; +} + +THTensor *THTensor_(newWithSize1d)(int64_t size0) +{ + return THTensor_(newWithSizeIntList)({size0}); +} + +THTensor *THTensor_(newWithSize2d)(int64_t size0, int64_t size1) +{ + return THTensor_(newWithSizeIntList)({size0, size1}); +} + +THTensor *THTensor_(newWithSize3d)(int64_t size0, int64_t size1, int64_t size2) +{ + return THTensor_(newWithSizeIntList)({size0, size1, size2}); +} + +THTensor *THTensor_(newWithSize4d)(int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + return THTensor_(newWithSizeIntList)({size0, size1, size2, size3}); +} + +THTensor *THTensor_(newClone)(THTensor *self) +{ + THTensor *tensor = THTensor_(new)(); + THTensor_(resizeAs)(tensor, self); + THTensor_(copy)(tensor, self); + return tensor; +} + +THTensor *THTensor_(newContiguous)(THTensor *self) +{ + if(!THTensor_(isContiguous)(self)) + return THTensor_(newClone)(self); + else + { + THTensor_(retain)(self); + return self; + } +} + +THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(select)(self, NULL, dimension_, sliceIndex_); + return self; +} + +THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_); + return self; +} + +THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + 
THTensor_(transpose)(self, NULL, dimension1_, dimension2_); + return self; +} + +THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(unfold)(self, NULL, dimension_, size_, step_); + return self; +} + +THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) +{ + ptrdiff_t numel = THTensor_(nElement)(tensor); + THTensor *self = THTensor_(new)(); + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), + at::IntList(tensor->stride, tensor->dim()), + at::IntList(inferred_size->data(), inferred_size->size)); + THArgCheck(stride.has_value(), 2, "view size is " + "not compatible with input tensor's size and stride (at least one dimension spans " + "across two contiguous subspaces). Call .contiguous() before .view()."); + auto stride_value = *stride; + THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); + THLongStorage_rawCopy(new_stride, stride_value.data()); + THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THLongStorage_free(inferred_size); + THLongStorage_free(new_stride); + return self; +} + +/* Resize */ +void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride) +{ + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + +#ifdef DEBUG + THAssert(size->size <= INT_MAX); +#endif + THTensor_(resizeNd)(self, size->size, THLongStorage_data(size), (stride ? THLongStorage_data(stride) : NULL)); +} + +void THTensor_(resizeAs)(THTensor *self, THTensor *src) +{ + if(!THTensor_(isSameSizeAs)(self, src)) + THTensor_(resizeNd)(self, src->dim(), src->size, NULL); +} + +void THTensor_(resize1d)(THTensor *tensor, int64_t size0) +{ + int64_t size[1] = {size0}; + THTensor_(resizeNd)(tensor, 1, size, nullptr); +} + +void THTensor_(resize2d)(THTensor *tensor, int64_t size0, int64_t size1) +{ + int64_t size[2] = {size0, size1}; + THTensor_(resizeNd)(tensor, 2, size, nullptr); +} + +void THTensor_(resize3d)(THTensor *tensor, int64_t size0, int64_t size1, int64_t size2) +{ + int64_t size[3] = {size0, size1, size2}; + THTensor_(resizeNd)(tensor, 3, size, nullptr); +} + +void THTensor_(resize4d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + int64_t size[4] = {size0, size1, size2, size3}; + THTensor_(resizeNd)(self, 4, size, nullptr); +} + +void THTensor_(resize5d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4) +{ + int64_t size[5] = {size0, size1, size2, size3, size4}; + THTensor_(resizeNd)(self, 5, size, nullptr); +} + +void THTensor_(set)(THTensor *self, THTensor *src) +{ + if(self != src) + THTensor_(setStorageNd)(self, + src->storage, + src->storageOffset, + src->dim(), + src->size, + src->stride); +} + +void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) +{ + if(size_ && stride_) + THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); + + AT_CHECK(size_, "size must not be null"); +#ifdef DEBUG + THAssert(size_ <= INT_MAX); +#endif + THTensor_(setStorageNd)(self, + storage_, + storageOffset_, + size_->size, + THLongStorage_data(size_), + (stride_ ? 
THLongStorage_data(stride_) : NULL)); +} + +void THTensor_(setStorageIntLists)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + at::IntList sizes, at::IntList strides) +{ + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + + THTensor_(setStorageNd)(self, storage_, storageOffset_, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); +} + +void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_}, {stride0_}); +} + +void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_, size1_}, + {stride0_, stride1_}); +} + +void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_, size1_, size2_}, + {stride0_, stride1_, stride2_}); +} + +void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_) +{ + + int64_t size[4] = {size0_, size1_, size2_, size3_}; + int64_t stride[4] = {stride0_, stride1_, stride2_, stride3_}; + + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, size, stride); +} + + +void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t firstIndex, int64_t size) +{ + if(!src) + src = self; + + THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck( firstIndex >= 0, 3, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM + THArgCheck( size >= 0, 4, "out of range"); +#else + THArgCheck( size > 0, 4, "out of range"); +#endif + THArgCheck(firstIndex <= src->size[dimension] - size, 4, "out of range"); + + THTensor_(set)(self, src); + + if(firstIndex > 0) + self->storageOffset += firstIndex*self->stride[dimension]; + + self->size[dimension] = size; +} + +void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex) +{ + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 3, "out of range"); + + THTensor_(set)(self, src); + THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1); + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; +} + +void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) +{ + int64_t z; + + if(!src) + src = self; + + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + + THTensor_(set)(self, src); + + if(dimension1 == dimension2) + return; + + z = self->stride[dimension1]; + self->stride[dimension1] = self->stride[dimension2]; + 
self->stride[dimension2] = z; + z = self->size[dimension1]; + self->size[dimension1] = self->size[dimension2]; + self->size[dimension2] = z; +} + +void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t size, int64_t step) +{ + int64_t *newSize; + int64_t *newStride; + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(step > 0, 4, "invalid step"); + + THTensor_(set)(self, src); + + newSize = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + newStride = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + + newSize[self->dim()] = size; + newStride[self->dim()] = self->stride[dimension]; + for(d = 0; d < self->dim(); d++) + { + if(d == dimension) + { + newSize[d] = (self->size[d] - size) / step + 1; + newStride[d] = step*self->stride[d]; + } + else + { + newSize[d] = self->size[d]; + newStride[d] = self->stride[d]; + } + } + + THFree(self->size); + THFree(self->stride); + + self->size = newSize; + self->stride = newStride; + self->dim_++; +} + +/* we have to handle the case where the result is a number */ +void THTensor_(squeeze)(THTensor *self, THTensor *src) +{ + int ndim = 0; + int d; + + if(!src) + src = self; + + THTensor_(set)(self, src); + + for(d = 0; d < src->dim(); d++) + { + if(src->size[d] != 1) + { + if(d != ndim) + { + self->size[ndim] = src->size[d]; + self->stride[ndim] = src->stride[d]; + } + ndim++; + } + } + +#ifndef USE_TH_SCALAR + /* right now, we do not handle 0-dimension tensors */ + if(ndim == 0 && src->dim() > 0) + { + self->size[0] = 1; + self->stride[0] = 1; + ndim = 1; + } +#endif + self->dim_ = ndim; +} + +void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "dimension out of range"); + + THTensor_(set)(self, src); + +#ifdef USE_TH_SCALAR + if(src->size[dimension] == 1) +#else + if(src->size[dimension] == 1 && src->dim() > 1) +#endif + { + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; + } +} + +void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); +#endif + + THTensor_(set)(self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); + self->dim_++; + for (d = self->dim()-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->dim()) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +int THTensor_(isTransposed)(const THTensor *self) +{ + if (THTensor_(isContiguous)(self)) { + return 0; + } + int64_t max_stride = 1; + int64_t size_max_stride = 1; + int64_t z = 1; + int d; + for (d = 0; d < self->_dim(); ++d) { + if (self->stride[d] == 0 && self->size[d] != 1) + return 0; + if (self->stride[d] > max_stride) { + max_stride = 
self->stride[d]; + size_max_stride = self->size[d]; + } + z *= self->size[d]; + } + if (z == max_stride * size_max_stride) { + return 1; + } + return 0; +} + +int THTensor_(isContiguous)(const THTensor *self) +{ + if (self->is_empty()) return 1; + int64_t z = 1; + int d; + for(d = self->dim()-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return 0; + } + } + return 1; +} + +int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims) +{ + int d; + if (self->_dim() != dims->size) + return 0; + + for(d = 0; d < self->_dim(); ++d) + { + if(self->size[d] != THLongStorage_data(dims)[d]) + return 0; + } + return 1; +} + +int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) +{ + int d; + if (self->dim() != src->dim()) + return 0; + for(d = 0; d < self->dim(); ++d) + { + if(self->size[d] != src->size[d]) + return 0; + } + return 1; +} + +int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) +{ + if (!self->storage) + return 0; + if (self->storage == src->storage && + self->storageOffset == src->storageOffset && + self->_dim() == src->_dim()) + { + int d; + for (d = 0; d < self->_dim(); ++d) + { + if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + return 0; + } + return 1; + } + return 0; +} + +ptrdiff_t THTensor_(nElement)(const THTensor *self) +{ + if(self->_dim() == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->_dim(); d++) + nElement *= self->size[d]; + return nElement; + } +} + +void THTensor_(retain)(THTensor *self) +{ + ++self->refcount; +} + +void THTensor_(free)(THTensor *self) +{ + THTensor_free(self); +} + +void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) +{ + if(self != dst) + THTensor_(copy)(dst, self); + + THTensor_(free)(self); +} + +/*******************************************************************************/ + +void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if(self->storage) + THStorage_(free)(self->storage); + + if(storage) + { + self->storage = storage; + THStorage_(retain)(self->storage); + } + else + self->storage = THStorage_(new)(); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THTensor_(resizeNd)(self, nDimension, size, stride); +} + +void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + ptrdiff_t totalSize; + bool hascorrectsize = true; + +#ifndef USE_TH_SCALAR + AT_CHECK(nDimension > 0, "resizeNd nDimension must be greater than 0"); +#else + AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); +#endif + + for(d = 0; d < nDimension; d++) + { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. 
+ if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif + if((self->dim() > d) && (size[d] != self->size[d])) { + hascorrectsize = false; + } + + // NB: this used to test that stride[d] was >= 0 + if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + hascorrectsize = false; + } + } + + if(nDimension != self->dim()) { + hascorrectsize = false; + } + + if(hascorrectsize) { + return; + } + + if(nDimension != self->dim()) + { + self->size = (int64_t *)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t *)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->dim_ = nDimension; + } + + totalSize = 1; + for(d = nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) { + self->stride[d] = stride[d]; + } else { + if(d == nDimension-1) { + self->stride[d] = 1; + } else { + // Keep stride monotonically increasing to match NumPy. + self->stride[d] = std::max(self->size[d+1], 1)*self->stride[d+1]; + } + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) { + self->storage = THStorage_(new)(); + } + if(totalSize+self->storageOffset > self->storage->size) { + THStorage_(resize)(self->storage, totalSize+self->storageOffset); + } + } +} + +void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) +{ + THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); +} + +real THTensor_(get1d)(const THTensor *tensor, int64_t x0) +{ + THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); +} + +void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) +{ + THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); +} + +real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) +{ + THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); +} + +void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) +{ + THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); +} + +real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) +{ + THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); +} + +void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) +{ + THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); +} + +real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) +{ + THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); +} + +THDescBuff THTensor_(desc)(const THTensor *tensor) { + const int L = TH_DESC_BUFF_LEN; + THDescBuff buf; + char *str = buf.str; + int n = 0; +#define _stringify(x) #x + n += snprintf(str, L-n, "torch." _stringify(x) "Tensor of size "); +#undef _stringify + int i; + for(i = 0; i < tensor->_dim(); i++) { + if(n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + if(i < tensor->_dim()-1) { + n += snprintf(str+n, L-n, "x"); + } + } + if(n >= L) { + snprintf(str+L-4, 4, "..."); + } + return buf; +} + +THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) { + THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor); + THDescBuff buf = THLongStorage_sizeDesc(size); + THLongStorage_free(size); + return buf; +} + +#endif diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h new file mode 100644 index 0000000..cdc8f7e --- /dev/null +++ b/aten/src/TH/generic/THTensor.h @@ -0,0 +1,137 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensor.h" +#else + +/* a la lua? dim, storageoffset, ... et les methodes ? */ + +#define THCTensor THTensor + +// Struct definition moved to THTensor.hpp +typedef struct THTensor THTensor; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THTensor type. +#define THFloatTensor THTensor +#define THDoubleTensor THTensor +#define THHalfTensor THTensor +#define THByteTensor THTensor +#define THCharTensor THTensor +#define THShortTensor THTensor +#define THIntTensor THTensor +#define THLongTensor THTensor + +/**** access methods ****/ +TH_API THStorage* THTensor_(storage)(const THTensor *self); +TH_API ptrdiff_t THTensor_(storageOffset)(const THTensor *self); + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). 
+TH_API int THTensor_(nDimension)(const THTensor *self); +TH_API int THTensor_(_nDimension)(const THTensor *self); +TH_API int64_t THTensor_(size)(const THTensor *self, int dim); +TH_API int64_t THTensor_(stride)(const THTensor *self, int dim); +TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); +TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); +TH_API real *THTensor_(data)(const THTensor *self); + + +/**** creation methods ****/ +TH_API THTensor *THTensor_(new)(void); +TH_API THTensor *THTensor_(newWithTensor)(THTensor *tensor); +/* stride might be NULL */ +TH_API THTensor *THTensor_(newWithStorage)(THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +TH_API THTensor *THTensor_(newWithStorage1d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +TH_API THTensor *THTensor_(newWithStorage2d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +TH_API THTensor *THTensor_(newWithStorage3d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +TH_API THTensor *THTensor_(newWithStorage4d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +/* stride might be NULL */ +TH_API THTensor *THTensor_(newWithSize)(THLongStorage *size_, THLongStorage *stride_); +TH_API THTensor *THTensor_(newWithSize1d)(int64_t size0_); +TH_API THTensor *THTensor_(newWithSize2d)(int64_t size0_, int64_t size1_); +TH_API THTensor *THTensor_(newWithSize3d)(int64_t size0_, int64_t size1_, int64_t size2_); +TH_API THTensor *THTensor_(newWithSize4d)(int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); + +TH_API THTensor *THTensor_(newClone)(THTensor *self); +TH_API THTensor *THTensor_(newContiguous)(THTensor *tensor); +TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_); +TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_); +TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_); +TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_); +TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size); + +// resize* methods simply resize the storage. So they may not retain the current data at current indices. +// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the +// values, unless you are doing some size and stride tricks, do not use resize*. 
+TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride); +TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src); +TH_API void THTensor_(resize1d)(THTensor *tensor, int64_t size0_); +TH_API void THTensor_(resize2d)(THTensor *tensor, int64_t size0_, int64_t size1_); +TH_API void THTensor_(resize3d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_); +TH_API void THTensor_(resize4d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); +TH_API void THTensor_(resize5d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_); +// Note: these are legacy resize functions that treat sizes as size->size == 0 and size->data() as being 0-terminated. + +TH_API void THTensor_(set)(THTensor *self, THTensor *src); +TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, int64_t *size, int64_t *stride); +TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +TH_API void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +TH_API void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, int64_t firstIndex_, int64_t size_); +TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, int64_t sliceIndex_); +TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_); +TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, int64_t size_, int64_t step_); + +TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src); +TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_); +TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_); + +TH_API int THTensor_(isContiguous)(const THTensor *self); +TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src); +TH_API int THTensor_(isSetTo)(const THTensor *self, const THTensor *src); +TH_API int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims); +TH_API ptrdiff_t THTensor_(nElement)(const THTensor *self); + +TH_API void THTensor_(retain)(THTensor *self); +TH_API void THTensor_(free)(THTensor *self); +TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst); + +/* Slow access methods [check everything] */ +TH_API void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value); +TH_API void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value); +TH_API void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value); +TH_API void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, 
int64_t x2, int64_t x3, real value); + +TH_API real THTensor_(get1d)(const THTensor *tensor, int64_t x0); +TH_API real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1); +TH_API real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2); +TH_API real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3); + +/* Debug methods */ +TH_API THDescBuff THTensor_(desc)(const THTensor *tensor); +TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor); + +#endif diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp new file mode 100644 index 0000000..fb4670c --- /dev/null +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -0,0 +1,1953 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorConv.cpp" +#else + +/* + 2D Input, 2D kernel : convolve given image with the given kernel. +*/ +void THTensor_(validXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (oc < 4)) { + /* regular convolution */ + for(yy = 0; yy < or_; yy++) { + for(xx = 0; xx < oc; xx++) { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + yy*sr*ic + xx*sc; + real *pw_ = k_; + real sum = 0; + for(ky = 0; ky < kr; ky++) { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[kx]; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < or_; yy++) { + real *pi_ = t_ + yy*sr*ic; + real *pw_ = k_; + for (ky = 0; ky < kr; ky++) { + real *pis_ = pi_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc); + pis_++; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + r_ += oc; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel. +*/ +void THTensor_(validConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (oc < 4)) { + /* regular convolution */ + for(yy = 0; yy < or_; yy++) { + for(xx = 0; xx < oc; xx++) { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + yy*sr*ic + xx*sc; + real *pw_ = k_ + kr*kc - 1; + real sum = 0; + for(ky = 0; ky < kr; ky++) { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[-kx]; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < or_; yy++) { + real *pw_ = k_ + kr*kc - 1; + real *pi_ = t_ + yy*sr*ic; + for (ky = 0; ky < kr; ky++) { + real *pis_ = pi_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc); + pis_++; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + r_ += oc; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. 
+*/ +void THTensor_(fullConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t oc = (ic - 1) * sc + kc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (ic < 4)) { + /* regular convolution */ + for(yy = 0; yy < ir; yy++) { + for(xx = 0; xx < ic; xx++) { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + yy*sr*oc + xx*sc; + real *pw_ = k_; + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[kx]; + } + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + t_++; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < ir; yy++) { + real *po_ = r_ + yy*sr*oc; + real *pw_ = k_; + for (ky = 0; ky < kr; ky++) { + real *pos_ = po_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic); + pos_++; + } + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + t_ += ic; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. +*/ +void THTensor_(fullXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t oc = (ic - 1) * sc + kc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (ic < 4)) { + /* regular convolution */ + for(yy = 0; yy < ir; yy++) { + for(xx = 0; xx < ic; xx++) { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + yy*sr*oc + xx*sc; + real *pw_ = k_ + kr*kc -1; + int64_t kx, ky; + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[-kx]; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + t_++; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < ir; yy++) { + real *po_ = r_ + yy*sr*oc; + real *pw_ = k_ + kr*kc -1; + for (ky = 0; ky < kr; ky++) { + real *pos_ = po_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic); + pos_++; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + t_ += ic; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, valid convolution. + for sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(validXCorr2DRevptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (kc < 4)) { + /* regular convolution */ + for(yy = 0; yy < kr; yy++) { + for(xx = 0; xx < kc; xx++) { + real *po_ = r_; + real *pi_ = t_ + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + + for(ky = 0; ky < or_; ky++) { + for(kx = 0; kx < oc; kx++) + po_[kx] += z * pi_[kx]; + pi_ += ic; + po_ += oc; + } + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < kr; yy++) { + for(xx = 0; xx < kc; xx++) { + real *po_ = r_; + real *pi_ = t_ + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + + for(ky = 0; ky < or_; ky++) { + THVector_(cadd)(po_, po_, pi_, z, oc); + pi_ += ic; + po_ += oc; + } + } + } + } +} +/* + 3D Input, 3D kernel : convolve given volume with the given kernel. 
+*/ +void THTensor_(validXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = (it - kt) / st + 1; + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t zz, xx, yy; + + for (zz = 0; zz < ot; zz++) + { + for(yy = 0; yy < or_; yy++) + { + for(xx = 0; xx < oc; xx++) + { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real *pw_ = k_; + real sum = 0; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[kx]; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + pi_ += (ir-kr)*ic; /* next input slice */ + } + /* Update output */ + *r_++ += sum*alpha; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel. +*/ +void THTensor_(validConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = (it - kt) / st + 1; + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t zz, xx, yy; + + for(zz = 0; zz < ot; zz++) + { + for(yy = 0; yy < or_; yy++) + { + for(xx = 0; xx < oc; xx++) + { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real *pw_ = k_ + kt*kr*kc - 1; + real sum = 0; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[-kx]; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + pi_ += (ir-kr)*ic; /* next input slice */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + } +} + + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. +*/ +void THTensor_(fullConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; + + int64_t zz, xx, yy; + + for(zz = 0; zz < it; zz++) + { + for(yy = 0; yy < ir; yy++) + { + for(xx = 0; xx < ic; xx++) + { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc; + real *pw_ = k_; + int64_t kz, kx, ky; + /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */ + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + /* printf("o=%g,k=%g," , po_[kx],pw_[kx]); */ + po_[kx] += z * pw_[kx]; + /* printf("o=%g " , po_[kx]); */ + } + /* printf("\n"); */ + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + po_ += (or_-kr)*oc; /* next output slice */ + /* printf("\n"); */ + } + t_++; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. 
+*/ +void THTensor_(fullXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; + + int64_t zz, xx, yy; + + for(zz = 0; zz < it; zz++) + { + for(yy = 0; yy < ir; yy++) + { + for(xx = 0; xx < ic; xx++) + { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc; + real *pw_ = k_ + kt*kr*kc -1; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[-kx]; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + po_ += (or_-kr)*oc; /* next output slice */ + } + t_++; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given image with the given kernel, valid convolution. + for sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(validXCorr3DRevptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = it - (kt - 1) * st; + int64_t or_ = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; + + int64_t zz, xx, yy; + for(zz = 0; zz < kt; zz++) + { + for(yy = 0; yy < kr; yy++) + { + for(xx = 0; xx < kc; xx++) + { + real *po_ = r_; + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + int64_t kz, kx, ky; + for(kz = 0; kz < ot; kz++) + { + for(ky = 0; ky < or_; ky++) + { + for(kx = 0; kx < oc; kx++) + po_[kx] += z * pi_[kx]; + pi_ += ic; + po_ += oc; + } + pi_ += (ir-or_)*ic; /* next input slice */ + } + } + } + } +} + +void THTensor_(conv2d)(real* output_data, + real alpha, + real* ptr_input, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelRows, int64_t nKernelCols, + int64_t srow, int64_t scol, + const char *vf, const char *xc) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); +} + +void THTensor_(conv3d)(real* output_data, + real alpha, + real* ptr_input, int64_t nInputDepth, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelDepth, int64_t nKernelRows, int64_t nKernelCols, + int64_t sdepth, int64_t srow, int64_t scol, + const char *vf, const char *xc) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, 
nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + THTensor_(fullConv3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + THTensor_(validConv3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); +} + +int64_t THTensor_(convsize)(int64_t x, int64_t k, int64_t s, const char* vf) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'"); + if (*vf == 'V') + return (x-k)/s + 1; + else + return (x-1)*s + k; +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel"); + + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ + +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + 
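/*
  Worked example for the two THTensor_(convsize) formulas (a hypothetical
  self-check, not part of the API):
    'V' (valid): (x - k)/s + 1
    'F' (full) : (x - 1)*s + k
*/
static void sketch_convsize(void)
{
  int64_t v  = THTensor_(convsize)(32, 5, 1, "V");  /* (32-5)/1 + 1 == 28 */
  int64_t f  = THTensor_(convsize)(32, 5, 1, "F");  /* (32-1)*1 + 5 == 36 */
  int64_t v2 = THTensor_(convsize)(32, 5, 2, "V");  /* (32-5)/2 + 1 == 14 (integer division) */
  (void)v; (void)f; (void)v2;
}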
for(i = 0; i < nInputPlane; i++) + { + /* get output */ + real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(validXCorr2DRevptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) +{ + int64_t nbatch, nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0, istride1, kstride1; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + istride1 = input->stride[1]; + nbatch = input->size[0]; + nInputPlane = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelPlane = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel"); + THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size"); + + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ + +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + for(i = 0; i < nInputPlane; i++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + /* get kernel */ + real *ptr_weight = weight_data + p*kstride0 + k*kstride1; + /* get output */ + real 
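/*
  Shape sketch for the "Rev" (gradient-w.r.t.-kernel) routines, with
  hypothetical sizes: input 3 x 16 x 16 (planes x rows x cols), kernel
  8 x 4 x 4, stride srow = scol = 2. Then
      nOutputRows = 16 - (4 - 1)*2 = 10,   nOutputCols = 10,
  and conv2DRevger resizes its result to the 4-D shape
  (nKernelPlane, nInputPlane, 10, 10) = 8 x 3 x 10 x 10, i.e. one
  correlation map per (kernel plane, input plane) pair.
*/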
*ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = input_data + p*istride0 + i*istride1; + + /* do image, kernel convolution */ + THTensor_(validXCorr2DRevptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A +*/ +void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get output */ + real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = 
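/*
  The zeroing/scaling blocks above implement the BLAS-style update
      r_ = beta * r_ + alpha * corr(t_, k_):
  the destination is first zeroed (freshly resized result or beta == 0) or
  scaled by beta, and the per-plane kernels then accumulate alpha-weighted
  correlations into it. A scalar sketch of the same convention (hypothetical
  helper, assuming a floating-point `real`):
*/
static real sketch_blas_style_update(real r, real beta, real alpha, real contrib)
{
  r = (beta == 0) ? 0 : r * beta;   /* prepare the destination */
  r += alpha * contrib;             /* accumulate the new term */
  return r;
}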
input_data+i*istride0; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 4D kernel, 3D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; + THTensor *input; + THTensor* kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + 
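/*
  The "matrix vector product" analogy in conv2Dmv can be read plane-wise:
  with a 4-D weight W of shape (nOutputPlane, nInputPlane, kH, kW) and a 3-D
  input x of shape (nInputPlane, H, W), output plane k is
      y[k] = beta * y[k] + alpha * sum_i corr2d(x[i], W[k][i]).
  A contiguous-layout sketch of that reduction (hypothetical helper; beta
  handling omitted, the destination is assumed already zeroed or scaled):
*/
static void sketch_conv2Dmv_core(real *y, real alpha,
                                 real *x, int64_t nInputPlane, int64_t H, int64_t W,
                                 real *w, int64_t nOutputPlane, int64_t kH, int64_t kW)
{
  int64_t oH = H - kH + 1, oW = W - kW + 1;   /* valid, stride 1 */
  for (int64_t k = 0; k < nOutputPlane; k++)
    for (int64_t i = 0; i < nInputPlane; i++)
      THTensor_(validXCorr2Dptr)(y + k * oH * oW, alpha,
                                 x + i * H * W, H, W,
                                 w + (k * nInputPlane + i) * kH * kW, kH, kW,
                                 1, 1);
}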
for (k = 0; k < r_->size[0]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) + { + int64_t i; + /* get output */ + real *ptr_output = output_data + k*nOutputCols*nOutputRows; + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + i*istride0; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + } + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows;*/ + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 4D kernel, 3D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t kstride0, kstride1; + THTensor *input; + THTensor* kernel; + int64_t nbatch; + ptrdiff_t nelem; + real *input_data; + real *weight_data; + real *output_data; + int64_t p; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nbatch = input->size[0]; + nInputPlane = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols); + + 
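/*
  conv2Dmm is the batched form of conv2Dmv: the same
  (nOutputPlane x nInputPlane) bank of 2-D kernels is applied independently
  to every batch element p, and the flat offsets used below are
      input : p*nInputPlane*nInputRows*nInputCols   + i*nInputRows*nInputCols
      output: p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputRows*nOutputCols
  so the result has shape (nbatch, nOutputPlane, nOutputRows, nOutputCols).
*/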
input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(p) + for (p=0; p < r_->size[0]; p++) + { + int64_t k; + for (k = 0; k < r_->size[1]; k++) + { + real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(p) + for(p=0; p < r_->size[0]; p++) + { + int64_t k; + for (k = 0; k < r_->size[1]; k++) + { + real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + } + +#pragma omp parallel for private(p) + for(p=0; p < nbatch; p++) + { + int64_t k; + for(k = 0; k < nOutputPlane; k++) + { + int64_t i; + /* get output */ + real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows; + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + } + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows;*/ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 2D input, 2D kernel, 2D output + scalar multiplication like + y <- x*y + beta*y +*/ +void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + THTensor *input; + THTensor* kernel; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputRows, nOutputCols; + real *ptr_input; + real *ptr_weight; + real *output_data; + ptrdiff_t nelem; + + AT_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputRows = input->size[0]; + nInputCols = input->size[1]; + nKernelRows = kernel->size[0]; + nKernelCols = kernel->size[1]; + + THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = 
THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize2d)(r_, nOutputRows, nOutputCols); + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + THTensor_(zero)(r_); + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + ptr_input = THTensor_(data)(input); + ptr_weight = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + + /* do image, kernel convolution */ + THTensor_(conv2d)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + component wise multiplication like + y <- y.*x + beta*y +*/ +void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + k*istride0; + + /* do image, kernel convolution */ + THTensor_(conv2d)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + /* Next output plane */ + output_data += nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + component wise multiplication like with a permutation map + y <- y.*x + beta*y +*/ +void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, 
nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor* kernel; + real *input_data; + real *weight_data; + real *output_data; + int64_t nmaps; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(map->_dim() == 2 , 4, "map: 2D Tensor expected"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) + || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + nmaps = map->size[0]; + + for(k = 0; k < nmaps; k++) + { + /* get indices */ + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; + + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + from*istride0; + /* get output */ + real *ptr_output = output_data + to*nOutputRows*nOutputCols; + + /* do image, kernel convolution */ + THTensor_(conv2d)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 5D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + 
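/*
  The map tensor used by conv2Dmap above is a 2-column list of 1-based
  (from, to) plane indices: row k means "correlate input plane `from` with
  kernel k and accumulate into output plane `to`" (the -1 in the loop
  converts to 0-based offsets). A hypothetical 3-row map wiring planes
  1->1, 2->1 and 3->2, built with the public setters:
*/
static THTensor *sketch_conv2Dmap_map(void)
{
  THTensor *map = THTensor_(newWithSize2d)(3, 2);
  THTensor_(set2d)(map, 0, 0, 1); THTensor_(set2d)(map, 0, 1, 1);
  THTensor_(set2d)(map, 1, 0, 2); THTensor_(set2d)(map, 1, 1, 1);
  THTensor_(set2d)(map, 2, 0, 3); THTensor_(set2d)(map, 2, 1, 2);
  return map;
}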
kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelDepth= kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel"); + + nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth; + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nKernelPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(validXCorr3DRevptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 4D input, 4D kernel, 5D output + like rank1 update + A <- xx' + beta*A +*/ +void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck((nInputDepth >= nKernelDepth + && nInputRows >= nKernelRows + && nInputCols >= nKernelCols) + || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = 
THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nKernelPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 5D kernel, 4D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelDepth = kernel->size[2]; + nKernelRows = kernel->size[3]; + nKernelCols = kernel->size[4]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + 
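/*
  Shape sketch for conv3Dger with hypothetical sizes: input 2 x 8 x 16 x 16
  (planes x depth x rows x cols), kernel 4 x 3 x 5 x 5, unit strides,
  vf = "V", xc = "X". convsize gives nOutputDepth = 6 and
  nOutputRows = nOutputCols = 12, so the result is resized to the 5-D shape
  (nKernelPlane, nInputPlane, 6, 12, 12) = 4 x 2 x 6 x 12 x 12.
*/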
THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + i*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + } + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + scalar multiplication like + y <- x*y + beta*y +*/ +void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + THTensor *input; + THTensor* kernel; + int64_t nInputDepth; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelDepth; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + real *ptr_input; + real *ptr_weight; + real *output_data; + ptrdiff_t nelem; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputDepth = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + nKernelDepth = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols); + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + THTensor_(zero)(r_); + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + ptr_input = THTensor_(data)(input); + ptr_weight = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 4D output + component wise multiplication like + y <- y.*x + beta*y +*/ +void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, 
nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + k*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 4D output + component wise multiplication like with a permutation map + y <- y.*x + beta*y +*/ +void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + ptrdiff_t nelem; + real *input_data; + real *weight_data; + real *output_data; + int64_t nmaps; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + 
THArgCheck(map->_dim() == 2 , 4, "map: 2D Tensor expected"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck((nInputDepth >= nKernelDepth + && nInputRows >= nKernelRows + && nInputCols >= nKernelCols) || *vf == 'F', + 2, "conv3Dmap : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + nmaps = map->size[0]; + + for(k = 0; k < nmaps; k++) + { + /* get indices */ + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; + + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + from*istride0; + /* get output */ + real *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols; + + /* do image, kernel convolution */ + THTensor_(conv3d)(ptr_output, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} +#endif diff --git a/aten/src/TH/generic/THTensorConv.h b/aten/src/TH/generic/THTensorConv.h new file mode 100644 index 0000000..279ece6 --- /dev/null +++ b/aten/src/TH/generic/THTensorConv.h @@ -0,0 +1,79 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorConv.h" +#else + +TH_API void THTensor_(validXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(validConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(fullXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(fullConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(validXCorr2DRevptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor 
*k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); + +TH_API void THTensor_(validXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(validConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(fullXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(fullConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(validXCorr3DRevptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol); +TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); + +#endif diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp new file mode 100644 index 0000000..939e5b8 --- /dev/null +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -0,0 +1,249 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorCopy.cpp" +#else + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#ifdef _OPENMP +#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 +#include +#endif + +int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { + const int MIN_SZ = 60 * 60; + return THTensor_(isContiguous)(tensor) && + !src->is_empty() && + THTensor_(nDimension)(src) == 2 && + THTensor_(stride)(src, 0) == 1 && + 
THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) && + THTensor_(nElement)(tensor) >= MIN_SZ; +} + +// special case copy where tensor is contiguous and src is a transposed matrix +// This can be generalized to most copies, but it's tricker +void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { + #define MIN(x, y) (((x) < (y)) ? (x) : (y)) + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +#ifdef TH_REAL_IS_BYTE + const int BLOCK_SZ = 120; +#else + const int BLOCK_SZ = 60; +#endif + + THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(tensor); + real *bp = THTensor_(data)(buf); + + + int64_t NR = THTensor_(size)(src, 0); + int64_t NC = THTensor_(size)(src, 1); + for (int64_t R = 0; R < NR; R += BLOCK_SZ) { + for (int64_t C = 0; C < NC; C += BLOCK_SZ) { + real *spo = sp + R + C * NR; + real *rpo = rp + C + R * NC; + + int nr = MIN(NR - R, BLOCK_SZ); + int nc = MIN(NC - C, BLOCK_SZ); + + // 1. copy columns from src to buf + for (int c = 0; c < nc; c++) { + memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real)); + } + + // 2. transpose buf in place + int rc_max = MAX(nr, nc); + int rc_min = MIN(nr, nc); + for (int r = 0; r < rc_max; r++) { + int end = MIN(r, rc_min); + for (int c = 0; c < end; c++) { + real tmp = bp[r + BLOCK_SZ * c]; + bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; + bp[r * BLOCK_SZ + c] = tmp; + } + } + + // 3. copy rows from buf to dst + for (int r = 0; r < nr; r++) { + memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real)); + } + } + } + THTensor_(free)(buf); + #undef MIN + #undef MAX +} + +void THTensor_(copy)(THTensor *tensor, THTensor *src) +{ + if (tensor == src) return; + ptrdiff_t tensorSize = THTensor_(nElement)(tensor); + ptrdiff_t srcSize = THTensor_(nElement)(src); + int tensorContig = THTensor_(isContiguous)(tensor); + int srcContig = THTensor_(isContiguous)(src); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); +#endif + if (tensorSize == srcSize) { + if ( tensorContig && srcContig) { + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(tensor); +#ifndef TH_REAL_IS_HALF +#ifdef _OPENMP + #pragma omp parallel if ( (tensorSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP) ) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + ptrdiff_t offset = tid * (tensorSize / num_threads); + ptrdiff_t end = (tid == num_threads - 1) ? tensorSize : offset + tensorSize / num_threads; + ptrdiff_t len = end - offset; + real *tensorData = rp + offset; + real *srcData = sp + offset; + THVector_(copy)(tensorData, srcData, len); + } +#else + THVector_(copy)(rp, sp, srcSize); +#endif + +#else + +#ifdef _OPENMP + if ((srcSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP)) { + ptrdiff_t i; + #pragma omp parallel for private (i) + for(i=0; i (int64_t)-1 -> (uint8_t)255 is +// guaranteed to look like this, but we have (double)-1 -> (uint8_t) +// because it's UB. This also makes UBSan really angry. +// +// I think those rules are stupid and we really shouldn't conform to them. +// The structs below ensure that for all unsigned types we use (currently +// only uint8_t), we will do an intermediate convertion via int64_t, +// to ensure that any negative values are wrapped around correctly. 
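/*
  Concrete illustration of the wrap-around described above (plain C++,
  independent of the TH macros): hopping through int64_t makes the negative
  value wrap modulo 2^8 instead of being undefined.
*/
static_assert(static_cast<uint8_t>(static_cast<int64_t>(-1.0)) == 255,
              "a negative double wraps to 255 after the int64_t hop");
/* by contrast, static_cast<uint8_t>(-1.0) directly is undefined behaviour,
   because -1 is not representable in uint8_t */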
+// +// Note that conversions from doubles to signed integral types that can't +// represent a particular value after truncating the fracitonal part are UB as well, +// but fixing them is not as simple as adding an int64_t intermediate, beacuse the +// int64_t -> conversion is UB for those large values anyway. +// I guess in that case we just have to live with that, but it's definitely less +// surprising than the thing above. +// +// For the curious: +// https://en.cppreference.com/w/cpp/language/implicit_conversion +// The relevant paragraph is "Floating–integral conversions". +template +struct inter_copy_type { + using type = T; +}; + +template<> +struct inter_copy_type { + using type = int64_t; +}; + +template +using inter_copy_type_t = typename inter_copy_type::type; + +#endif + +#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, \ + *tensor_data = static_cast( \ + static_cast>(*src_data));) \ +} + +#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ +} + +#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, \ + *tensor_data = static_cast( \ + static_cast>( \ + TH_half2float(*src_data)));) \ +} + +#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ +} + +#ifndef TH_REAL_IS_HALF +IMPLEMENT_THTensor_COPY(Byte, uint8_t) +IMPLEMENT_THTensor_COPY(Char, int8_t) +IMPLEMENT_THTensor_COPY(Short, int16_t) +IMPLEMENT_THTensor_COPY(Int, int32_t) +IMPLEMENT_THTensor_COPY(Long, int64_t) +IMPLEMENT_THTensor_COPY(Float, float) +IMPLEMENT_THTensor_COPY(Double, double) +IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) +IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) +IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) + +#endif /* REAL_IS_HALF */ + +#endif diff --git a/aten/src/TH/generic/THTensorCopy.h b/aten/src/TH/generic/THTensorCopy.h new file mode 100644 index 0000000..b9e5bfc --- /dev/null +++ b/aten/src/TH/generic/THTensorCopy.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorCopy.h" +#else + +/* Support for copy between different Tensor types */ + +TH_API void THTensor_(copy)(THTensor *tensor, THTensor *src); +TH_API void THTensor_(copyByte)(THTensor *tensor, struct THByteTensor *src); +TH_API void THTensor_(copyChar)(THTensor *tensor, struct THCharTensor *src); +TH_API void THTensor_(copyShort)(THTensor *tensor, struct THShortTensor *src); +TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src); +TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src); +TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor 
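/*
  For the Byte specialization (real == uint8_t),
  IMPLEMENT_THTensor_COPY(Double, double) therefore expands, per element, to
      *tensor_data = static_cast<uint8_t>(static_cast<int64_t>(*src_data));
  which is exactly the int64_t hop motivated above; TH_TENSOR_APPLY2 supplies
  the element-wise traversal of both tensors.
*/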
*src); +TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src); +TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src); + +#endif diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp new file mode 100644 index 0000000..de65f08 --- /dev/null +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -0,0 +1,45 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorFastGetSet.hpp" +#else + +static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]]; +} + +static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]]; +} + +static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]]; +} + +static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]]; +} + +static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]]; +} + +static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]] = value; +} + +static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]] = value; +} + +static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]] = value; +} + +static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]] = value; +} + +static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]] = value; +} + +#endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp new file mode 100644 index 0000000..9bc5b19 --- /dev/null +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -0,0 +1,1139 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorLapack.cpp" +#else + +/* +Check if self is transpose of a contiguous matrix +*/ +static int THTensor_(isTransposedContiguous)(THTensor *self) +{ + return self->stride[0] == 1 && self->stride[1] == self->size[0]; +} +/* +If a matrix is a regular contiguous matrix, make sure it is transposed +because this is what we return from Lapack calls. 
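/*
  Worked example for the fastGet/fastSet offset arithmetic: a contiguous
  3 x 4 matrix has strides {4, 1}, so fastGet2d(self, 2, 3) reads
      data[storageOffset + 2*4 + 3*1] = data[storageOffset + 11],
  i.e. the last element. Its transposed view (4 x 3) has strides {1, 4},
  which is precisely the pattern isTransposedContiguous() tests for
  (stride[0] == 1 and stride[1] == size[0]).
*/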
+*/ +static void THTensor_(checkTransposed)(THTensor *self) +{ + if(THTensor_(isContiguous)(self)) + THTensor_(transpose)(self, NULL, 0, 1); + return; +} +/* +newContiguous followed by transpose +Similar to (newContiguous), but checks if the transpose of the matrix +is contiguous and also limited to 2D matrices. +*/ +static THTensor *THTensor_(newTransposedContiguous)(THTensor *self) +{ + THTensor *tensor; + if(THTensor_(isTransposedContiguous)(self)) + { + THTensor_(retain)(self); + tensor = self; + } + else + { + tensor = THTensor_(newContiguous)(self); + THTensor_(transpose)(tensor, NULL, 0, 1); + } + + return tensor; +} + +/* +Given the result tensor and src tensor, decide if the lapack call should use the +provided result tensor or should allocate a new space to put the result in. + +The returned tensor have to be freed by the calling function. + +nrows is required, because some lapack calls, require output space smaller than +input space, like underdetermined gels. +*/ +static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows) +{ + /* check if user wants to reuse src and if it is correct shape/size */ + if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows) + THTensor_(retain)(result); + else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */ + result = THTensor_(new)(); + else + THTensor_(retain)(result); + return result; +} + +/* +Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require +the resulting tensor to be larger than src. +*/ +static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows) +{ + THTensor *result; + THTensor *view; + + if (src == NULL) + src = self; + result = THTensor_(checkLapackClone)(self, src, nrows); + if (src == result) + return result; + + THTensor_(resize2d)(result, src->size[1], nrows); + THTensor_(checkTransposed)(result); + + if (src->size[0] == nrows) + THTensor_(copy)(result, src); + else + { + view = THTensor_(newNarrow)(result, 0, 0, src->size[0]); + THTensor_(copy)(view, src); + THTensor_(free)(view); + } + return result; +} + +/* +Create a clone of src in self column major order for use with Lapack. +If src == self, a new tensor is allocated, in any case, the return tensor should be +freed by calling function. 
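+
+A typical call site (see e.g. gesv below) looks roughly like:
+
+  THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);   // column-major working copy
+  ...                                                     // LAPACK call on THTensor_(data)(ra__)
+  THTensor_(freeCopyTo)(ra__, ra_);                       // copy result back into ra_ and free ra__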
+*/ +static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src) +{ + return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]); +} + +void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) +{ + int free_b = 0; + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 2, "B should not be empty"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THIntTensor *ipiv; + THTensor *ra__; // working version of A matrix to be passed into lapack GELS + THTensor *rb__; // working version of B matrix to be passed into lapack GELS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + ipiv = THIntTensor_newWithSize1d((int64_t)n); + THLapack_(gesv)(n, nrhs, + THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), + THTensor_(data)(rb__), ldb, &info); + + THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + THIntTensor_free(ipiv); + if (free_b) THTensor_(free)(b);), + "gesv", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + THIntTensor_free(ipiv); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, + const char *uplo, const char *trans, const char *diag) +{ + int free_b = 0; + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->_dim()); + THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->_dim()); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS + THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + THLapack_(trtrs)(uplo[0], trans[0], diag[0], n, nrhs, + THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, &info); + + + THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + if (free_b) THTensor_(free)(b);), + "trtrs", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(gels)(THTensor *rb_, 
THTensor *ra_, THTensor *b, THTensor *a) +{ + int free_b = 0; + // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_. + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 1, "B should not be empty"); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int m, n, nrhs, lda, ldb, info, lwork; + THTensor *work = NULL; + real wkopt = 0; + + THTensor *ra__ = NULL; // working version of A matrix to be passed into lapack GELS + THTensor *rb__ = NULL; // working version of B matrix to be passed into lapack GELS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + lda = m; + ldb = (m > n) ? m : n; + + rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb); + + nrhs = rb__->size[1]; + info = 0; + + + /* get optimal workspace size */ + THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, + &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero", + THCleanup(THTensor_(free)(ra__); + THTensor_(free)(rb__); + THTensor_(free)(work); + if (free_b) THTensor_(free)(b);), + "gels", info,""); + + /* + * In the m < n case, if the input b is used as the result (so b == _rb), + * then rb_ was originally m by nrhs but now should be n by nrhs. + * This is larger than before, so we need to expose the new rows by resizing. 
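+   * For example, with a 2 x 3 A and a 2 x 1 b (so m = 2 < n = 3), the
+   * minimum-norm solution has n = 3 rows, and rb_ is resized from 2 x 1
+   * to 3 x 1 here.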
+ */ + if (m < n && b == rb_) { + THTensor_(resize2d)(rb_, n, nrhs); + } + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + THTensor_(free)(work); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr) +{ + int n, lda, lwork, info, ldvr; + THTensor *work=nullptr, *wi, *wr, *a; + real wkopt; + real *rv_data; + int64_t i; + + THTensor *re__ = NULL; + THTensor *rv__ = NULL; + + THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square"); + + /* we want to definitely clone a_ for geev*/ + a = THTensor_(cloneColumnMajor)(NULL, a_); + + n = a->size[0]; + lda = n; + + wi = THTensor_(newWithSize1d)(n); + wr = THTensor_(newWithSize1d)(n); + + rv_data = NULL; + ldvr = 1; + if (*jobvr == 'V') + { + THTensor_(resize2d)(rv_,n,n); + /* guard against someone passing a correct size, but wrong stride */ + rv__ = THTensor_(newTransposedContiguous)(rv_); + rv_data = THTensor_(data)(rv__); + ldvr = n; + } + THTensor_(resize2d)(re_,n,2); + re__ = THTensor_(newContiguous)(re_); + + if (n > 0) { // lapack doesn't work with size 0 + /* get optimal workspace size */ + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, &wkopt, -1, &info); + + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", + THCleanup(THTensor_(free)(re__); + THTensor_(free)(rv__); + THTensor_(free)(a); + THTensor_(free)(wi); + THTensor_(free)(wr); + THTensor_(free)(work);), + "geev", info,""); + } + + { + real *re_data = THTensor_(data)(re__); + real *wi_data = THTensor_(data)(wi); + real *wr_data = THTensor_(data)(wr); + for (i=0; idim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1,"A should be square"); + + int n, lda, lwork, info; + THTensor *work = nullptr; + real wkopt; + + THTensor *rv__ = NULL; + THTensor *re__ = NULL; + + rv__ = THTensor_(cloneColumnMajor)(rv_, a); + + n = rv__->size[0]; + lda = n; + + THTensor_(resize1d)(re_,n); + re__ = THTensor_(newContiguous)(re_); + + /* get optimal workspace size */ + if (n != 0) { + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", + THCleanup(THTensor_(free)(rv__); + THTensor_(free)(re__); + THTensor_(free)(work);), + "syev", info,""); + } + + // No eigenvectors specified + if (*jobz == 'N') { + THTensor_(fill)(rv_, 0); + } + + THTensor_(freeCopyTo)(rv__, rv_); + THTensor_(freeCopyTo)(re__, re_); + THTensor_(free)(work); +} + +void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu) +{ + THTensor *ra_ = THTensor_(new)(); + THTensor_(gesvd2)(ru_, rs_, rv_, ra_, a, jobu); + THTensor_(free)(ra_); +} + +void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) +{ + if (a == NULL) a = ra_; + THArgCheck(a->dim() 
== 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); + + int k,m, n, lda, ldu, ldvt, lwork, info; + THTensor *work; + THTensor *rvf_ = THTensor_(new)(); + real wkopt; + + THTensor *ra__ = NULL; + THTensor *ru__ = NULL; + THTensor *rs__ = NULL; + THTensor *rv__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + k = (m < n ? m : n); + + lda = m; + ldu = m; + ldvt = n; + + THTensor_(resize1d)(rs_,k); + THTensor_(resize2d)(rvf_,ldvt,n); + if (*jobu == 'A') + THTensor_(resize2d)(ru_,m,ldu); + else + THTensor_(resize2d)(ru_,k,ldu); + + THTensor_(checkTransposed)(ru_); + + /* guard against someone passing a correct size, but wrong stride */ + ru__ = THTensor_(newTransposedContiguous)(ru_); + rs__ = THTensor_(newContiguous)(rs_); + rv__ = THTensor_(newContiguous)(rvf_); + + THLapack_(gesvd)(jobu[0],jobu[0], + m,n,THTensor_(data)(ra__),lda, + THTensor_(data)(rs__), + THTensor_(data)(ru__), + ldu, + THTensor_(data)(rv__), ldvt, + &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(gesvd)(jobu[0],jobu[0], + m,n,THTensor_(data)(ra__),lda, + THTensor_(data)(rs__), + THTensor_(data)(ru__), + ldu, + THTensor_(data)(rv__), ldvt, + THTensor_(data)(work),lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.", + THCleanup( + THTensor_(free)(ru__); + THTensor_(free)(rs__); + THTensor_(free)(rv__); + THTensor_(free)(ra__); + THTensor_(free)(work);), + "gesvd", info, ""); + + if (*jobu == 'S') + THTensor_(narrow)(rv__,NULL,1,0,k); + + THTensor_(freeCopyTo)(ru__, ru_); + THTensor_(freeCopyTo)(rs__, rs_); + THTensor_(freeCopyTo)(rv__, rvf_); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); + + if (*jobu == 'S') { + THTensor_(narrow)(rvf_,NULL,1,0,k); + } + THTensor_(resizeAs)(rv_, rvf_); + THTensor_(copy)(rv_, rvf_); + THTensor_(free)(rvf_); +} + +void THTensor_(getri)(THTensor *ra_, THTensor *a) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int m, n, lda, info, lwork; + real wkopt; + THIntTensor *ipiv; + THTensor *work; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + lda = m; + ipiv = THIntTensor_newWithSize1d((int64_t)m); + + /* Run LU */ + THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info); + THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", + THCleanup( + THTensor_(free)(ra__); + THIntTensor_free(ipiv);), + "getrf", info, info); + + /* Run inverse */ + THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info); + THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work); + THIntTensor_free(ipiv);), + "getri", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); + THIntTensor_free(ipiv); +} + +void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) +{ + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + /* Build full matrix */ + real *p = THTensor_(data)(a); + 
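+  /* uplo selects which triangle holds the data of interest: for 'U' the
+     strictly lower part of the n x n matrix is zeroed, for 'L' the strictly
+     upper part; the diagonal itself is always kept. */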
int64_t i, j; + + /* Upper Triangular Case */ + if (uplo[0] == 'U') + { + /* Clear lower triangle (excluding diagonals) */ + for (i=0; i_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + /* Build full matrix */ + real *p = THTensor_(data)(a); + int64_t i, j; + + /* Upper Triangular Case */ + if (uplo[0] == 'U') + { + /* Clear lower triangle (excluding diagonals) */ + for (i=0; i_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n, lda, info; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + n = ra__->size[0]; + lda = n; + + /* Run Factorization */ + THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info); + THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite", + THCleanup(THTensor_(free)(ra__);), + "potrf", info, ""); + + THTensor_(clearUpLoTriangle)(ra__, uplo); + THTensor_(freeCopyTo)(ra__, ra_); +} + +void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) +{ + int free_b = 0; + if (b == NULL) b = rb_; + + THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->_dim()); + THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->_dim()); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS + THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS + + ra__ = THTensor_(cloneColumnMajor)(NULL, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + THLapack_(potrs)(uplo[0], n, nrhs, THTensor_(data)(ra__), + lda, THTensor_(data)(rb__), ldb, &info); + + + THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + if (free_b) THTensor_(free)(b);), + "potrs", info, info); + + if (free_b) THTensor_(free)(b); + THTensor_(free)(ra__); + THTensor_(freeCopyTo)(rb__, rb_); +} + +void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n, lda, info; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + n = ra__->size[0]; + lda = n; + + /* Run inverse */ + THLapack_(potri)(uplo[0], n, THTensor_(data)(ra__), lda, &info); + THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized", + THCleanup(THTensor_(free)(ra__);), + "potri", info, info); + + THTensor_(copyUpLoTriangle)(ra__, uplo); + THTensor_(freeCopyTo)(ra__, ra_); +} + +/* + Computes the Cholesky factorization with complete pivoting of a real symmetric + positive semidefinite matrix. + + Args: + * `ra_` - result Tensor in which to store the factor U or L from the + Cholesky factorization. + * `rpiv_` - result IntTensor containing sparse permutation matrix P, encoded + as P[rpiv_[k], k] = 1. 
+ * `a` - input Tensor; the input matrix to factorize. + * `uplo` - string; specifies whether the upper or lower triangular part of + the symmetric matrix A is stored. "U"/"L" for upper/lower + triangular. + * `tol` - double; user defined tolerance, or < 0 for automatic choice. + The algorithm terminates when the pivot <= tol. + */ +void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) { + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a); + THIntTensor_resize1d(rpiv_, n); + + // Allocate working tensor + THTensor *work = THTensor_(newWithSize1d)(2 * n); + + // Run Cholesky factorization + int lda = n; + int rank, info; + + THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda, + THIntTensor_data(rpiv_), &rank, tol, + THTensor_(data)(work), &info); + + THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "pstrf", info,""); + + THTensor_(clearUpLoTriangle)(ra__, uplo); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + Perform a QR decomposition of a matrix. + + In LAPACK, two parts of the QR decomposition are implemented as two separate + functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped + directly, below - but to make the common usage convenient, we also provide + this function, which calls them both and returns the results in a more + intuitive form. + + Args: + * `rq_` - result Tensor in which to store the Q part of the decomposition. + * `rr_` - result Tensor in which to store the R part of the decomposition. + * `a` - input Tensor; the matrix to decompose. + +*/ +void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) +{ + int m = a->size[0]; + int n = a->size[1]; + int k = (m < n ? m : n); + THTensor *ra_ = THTensor_(new)(); + THTensor *rtau_ = THTensor_(new)(); + THTensor *rr__ = THTensor_(new)(); + THTensor_(geqrf)(ra_, rtau_, a); + THTensor_(resize2d)(rr__, k, ra_->size[1]); + THTensor_(narrow)(rr__, ra_, 0, 0, k); + THTensor_(triu)(rr_, rr__, 0); + THTensor_(resize2d)(rq_, ra_->size[0], k); + THTensor_(orgqr)(rq_, ra_, rtau_); + THTensor_(narrow)(rq_, rq_, 1, 0, k); + THTensor_(free)(ra_); + THTensor_(free)(rtau_); + THTensor_(free)(rr__); +} + +/* + The geqrf function does the main work of QR-decomposing a matrix. + However, rather than producing a Q matrix directly, it produces a sequence of + elementary reflectors which may later be composed to construct Q - for example + with the orgqr function, below. + + Args: + * `ra_` - Result matrix which will contain: + i) The elements of R, on and above the diagonal. + ii) Directions of the reflectors implicitly defining Q. + * `rtau_` - Result tensor which will contain the magnitudes of the reflectors + implicitly defining Q. + * `a` - Input matrix, to decompose. If NULL, `ra_` is used as input. + + For further details, please see the LAPACK documentation. + +*/ +void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a) +{ + if (a == NULL) ra_ = a; + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); + + THTensor *ra__ = NULL; + + /* Prepare the input for LAPACK, making a copy if necessary. 
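+     (LAPACK's geqrf overwrites its input in place and expects column-major
+     storage, hence the column-major working clone below.)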
*/ + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + int m = ra__->size[0]; + int n = ra__->size[1]; + int k = (m < n ? m : n); + int lda = m; + THTensor_(resize1d)(rtau_, k); + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, + THTensor_(data)(rtau_), + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, + THTensor_(data)(rtau_), + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "geqrf", info,""); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + The orgqr function allows reconstruction of a matrix Q with orthogonal + columns, from a sequence of elementary reflectors, such as is produced by the + geqrf function. + + Args: + * `ra_` - result Tensor, which will contain the matrix Q. + * `a` - input Tensor, which should be a matrix with the directions of the + elementary reflectors below the diagonal. If NULL, `ra_` is used as + input. + * `tau` - input Tensor, containing the magnitudes of the elementary + reflectors. + + For further details, please see the LAPACK documentation. + +*/ +void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + + THTensor *ra__ = NULL; + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + int m = ra__->size[0]; + int k = tau->size[0]; + int lda = m; + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, + THTensor_(data)(tau), + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, + THTensor_(data)(tau), + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "orgqr", info,""); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + The ormqr function multiplies Q with another matrix from a sequence of + elementary reflectors, such as is produced by the geqrf function. + + Args: + * `ra_` - result Tensor, which will contain the matrix Q' c. + * `a` - input Tensor, which should be a matrix with the directions of the + elementary reflectors below the diagonal. If NULL, `ra_` is used as + input. + * `tau` - input Tensor, containing the magnitudes of the elementary + reflectors. + * `c` - input Tensor, containing the matrix to be multiplied. + * `side` - char, determining whether c is left- or right-multiplied with Q. + * `trans` - char, determining whether to transpose Q before multiplying. + + For further details, please see the LAPACK documentation. 
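+
+  For example, after THTensor_(geqrf)(ra_, rtau_, a) has factored A, the
+  product Q^T * c can be formed without materializing Q, roughly as:
+
+    THTensor_(ormqr)(result, ra_, rtau_, c, "L", "T");
+
+  where `result` is an output tensor allocated by the caller.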
+ +*/ +void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + + THTensor *ra__ = NULL; + ra__ = THTensor_(cloneColumnMajor)(ra_, c); + + int m = c->size[0]; + int n = c->size[1]; + int k = tau->size[0]; + int lda; + if (*side == 'L') + { + lda = m; + } + else + { + lda = n; + } + int ldc = m; + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, + THTensor_(data)(tau), THTensor_(data)(ra__), ldc, + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, + THTensor_(data)(tau), THTensor_(data)(ra__), ldc, + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "ormqr", info,""); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a) +{ + AT_CHECK(THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); + if (!pivot) { + THError("btrifact without pivoting is not implemented on the CPU"); + } + + if (ra_ != a) { + THTensor_(resizeAs)(ra_, a); + THTensor_(copy)(ra_, a); + } + + int m = a->size[1]; + int n = a->size[2]; + if (m != n) { + THError("btrifact is only implemented for square matrices"); + } + int64_t num_batches = THTensor_(size)(a, 0); + THTensor *ra__; + int lda; + + if (ra_->stride[1] == 1) { + // column ordered, what BLAS wants + lda = ra_->stride[2]; + ra__ = ra_; + } else { + // not column ordered, need to make it such (requires copy) + THTensor *transp_r_ = THTensor_(newTranspose)(ra_, 1, 2); + ra__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(ra__, NULL, 1, 2); + lda = ra__->stride[2]; + } + + THTensor *ai = THTensor_(new)(); + THTensor *rai = THTensor_(new)(); + THIntTensor *rpivoti = THIntTensor_new(); + + int info = 0; + int *info_ptr = &info; + if (rinfo_) { + THIntTensor_resize1d(rinfo_, num_batches); + info_ptr = THIntTensor_data(rinfo_); + } + + THIntTensor_resize2d(rpivots_, num_batches, n); + + int64_t batch = 0; + for (; batch < num_batches; ++batch) { + THTensor_(select)(ai, a, 0, batch); + THTensor_(select)(rai, ra__, 0, batch); + THIntTensor_select(rpivoti, rpivots_, 0, batch); + + THLapack_(getrf)(n, n, THTensor_(data)(rai), lda, + THIntTensor_data(rpivoti), info_ptr); + if (rinfo_) { + info_ptr++; + } else if (info != 0) { + break; + } + } + + THTensor_(free)(ai); + THTensor_(free)(rai); + THIntTensor_free(rpivoti); + + if (ra__ != ra_) { + THTensor_(freeCopyTo)(ra__, ra_); + } + + if (!rinfo_ && info != 0) { + THError("failed to factorize batch element %ld (info == %d)", batch, info); + } +} + +void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots) +{ + AT_CHECK(!atf->is_empty() && THTensor_(nDimension)(atf) == 3, "expected non-empty 3D tensor, got size: ", + atf->sizes()); + AT_CHECK(!b->is_empty() && (THTensor_(nDimension)(b) == 3 || + THTensor_(nDimension)(b) == 2), "expected non-empty 2D or 3D tensor, got size: ", b->sizes()); + THArgCheck(THTensor_(size)(atf, 0) == 
+ THTensor_(size)(b, 0), 3, "number of batches must be equal"); + THArgCheck(THTensor_(size)(atf, 1) == + THTensor_(size)(atf, 2), 3, "A matrices must be square"); + THArgCheck(THTensor_(size)(atf, 1) == + THTensor_(size)(b, 1), 3, "dimensions of A and b must be equal"); + + if (rb_ != b) { + THTensor_(resizeAs)(rb_, b); + THTensor_(copy)(rb_, b); + } + + int64_t num_batches = atf->size[0]; + int64_t n = atf->size[1]; + int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + + int lda, ldb; + THTensor *atf_; + THTensor *rb__; + + // correct ordering of A + if (atf->stride[1] == 1) { + // column ordered, what BLAS wants + lda = atf->stride[2]; + atf_ = atf; + } else { + // not column ordered, need to make it such (requires copy) + // it would be nice if we could use the op(A) flags to automatically + // transpose A if needed, but this leads to unpredictable behavior if the + // user clones A_tf later with a different ordering + THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2); + atf_ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(atf_, NULL, 1, 2); + lda = atf_->stride[2]; + } + + // correct ordering of B + if (rb_->stride[1] == 1) { + // column ordered + if (rb_->_dim() == 2 || rb_->size[2] == 1) { + ldb = n; + } else { + ldb = rb_->stride[2]; + } + rb__ = rb_; + } else { + // make column ordered + if (rb_->_dim() > 2) { + THTensor *transp_r_ = THTensor_(newTranspose)(rb_, 1, 2); + rb__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(rb__, NULL, 1, 2); + ldb = rb__->stride[2]; + } else { + rb__ = THTensor_(newClone)(rb_); + ldb = n; + } + } + + THTensor *ai = THTensor_(new)(); + THTensor *rbi = THTensor_(new)(); + THIntTensor *pivoti = THIntTensor_new(); + + if (!THIntTensor_isContiguous(pivots)) { + THError("Error: rpivots_ is not contiguous."); + } + + for (int64_t batch = 0; batch < num_batches; ++batch) { + THTensor_(select)(ai, atf_, 0, batch); + THTensor_(select)(rbi, rb__, 0, batch); + THIntTensor_select(pivoti, pivots, 0, batch); + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + int info; + THLapack_(getrs)('N', n, nrhs, THTensor_(data)(ai), lda, + THIntTensor_data(pivoti), THTensor_(data)(rbi), + ldb, &info); + if (info != 0) { + THError("Error: Nonzero info."); + } +#else + THError("Unimplemented"); +#endif + } + + THTensor_(free)(ai); + THTensor_(free)(rbi); + THIntTensor_free(pivoti); + + if (atf_ != atf) { + THTensor_(free)(atf_); + } + + if (rb__ != rb_) { + THTensor_(freeCopyTo)(rb__, rb_); + } +} + +#endif diff --git a/aten/src/TH/generic/THTensorLapack.h b/aten/src/TH/generic/THTensorLapack.h new file mode 100644 index 0000000..8785943 --- /dev/null +++ b/aten/src/TH/generic/THTensorLapack.h @@ -0,0 +1,25 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorLapack.h" +#else + +TH_API void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); +TH_API void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_, const char *uplo, const char *trans, const char *diag); +TH_API void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); +TH_API void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobz, const char *uplo); +TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr); +TH_API void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobu); +TH_API void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor 
*rv_, THTensor *ra_, THTensor *a, const char *jobu); +TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); +TH_API void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo); +TH_API void THTensor_(potrs)(THTensor *rb_, THTensor *b_, THTensor *a_, const char *uplo); +TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo); +TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a); +TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a); +TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau); +TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans); +TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol); + +TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a); +TH_API void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots); + +#endif diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp new file mode 100644 index 0000000..8559d5d --- /dev/null +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -0,0 +1,4677 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorMath.cpp" +#else + +#ifndef NAN + #define NAN (nan(NULL)) +#endif + +#ifdef _OPENMP +#include +#endif + +#define HYPER_TH_OMP_OVERHEAD_THRESHOLD 2000 +#define ORDIN_TH_OMP_OVERHEAD_THRESHOLD 20000 +#define UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD 50000 +#define TH_OMP_OVERHEAD_THRESHOLD 100000 + +#ifdef _OPENMP + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE *TENSOR##_data = THTensor_(data)(TENSOR) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ +{ \ + TYPE *TENSOR##_data = THTensor_(data)(TENSOR); \ + ptrdiff_t TENSOR##_len = THTensor_(nElement)(TENSOR); \ + CODE \ +} +#endif + +#ifdef _OPENMP +#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? 
TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ + ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ + CODE \ +} +#endif + +#ifdef _OPENMP +#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ + TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ + TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3); \ + ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ + CODE \ +} +#endif + +#define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \ +{ \ + if(!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same size", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + } \ +} + +// Used for `scatter` and `scatterAdd` +// Assumes TENSOR1 is real +// TENSOR2 is src +// TENSOR3 is index +// Tests: +// 1. index->size[d] <= src->size[d] for all d +// 2. 
index->size[d] <= real->size[d] for all d != dim +#define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ +{ \ + int shape_check_flag = 0; \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + int64_t TENSOR3##_dim_size = TENSOR3->size[TH_TENSOR_DIM_APPLY_i]; \ + if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \ + if (TENSOR3##_dim_size > TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (TENSOR3##_dim_size > TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (shape_check_flag == 1) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ + THError("Expected %s %s to be smaller size than %s %s and to be smaller than %s %s apart from dimension %d", \ + #TENSOR3, T3buff.str, #TENSOR2, T2buff.str, #TENSOR1, T1buff.str, DIMENSION); \ + } \ +} + +static inline real THTensor_(powOne)(real x, real y) { +#if defined(TH_REAL_IS_FLOAT) + return powf(x, y); +#elif defined(TH_REAL_IS_DOUBLE) + return pow(x, y); +#else + THArgCheck(y >= 0, 1, + "Integers to negative integer powers are not allowed"); + real result = 1; + while (y) { + if (y & 1) { + result *= x; + } + y /= 2; + x *= x; + } + return result; +#endif +} + +void THTensor_(fill)(THTensor *r_, real value) +{ + if (THTensor_(isContiguous)(r_) || THTensor_(isTransposed)(r_)) { + TH_TENSOR_APPLY_CONTIG(real, r_, THVector_(fill)(r__data, value, r__len);); + } else { + TH_TENSOR_APPLY(real, r_, + if (r__stride == 1) { + THVector_(fill)(r__data, value, r__size); + r__i = r__size; + r__data += r__stride * r__size; + break; + } else { + *r__data = value; + } + ); + } +} + +void THTensor_(zero)(THTensor *r_) +{ + THTensor_(fill)(r_, 0); +} + +void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value) +{ + TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, + if (*mask_data > 1) + { + THFree(mask_counter); + THFree(tensor_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = value; + }); +} + +void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src ) +{ + THTensor *srct = THTensor_(newContiguous)(src); + real *src_data = THTensor_(data)(srct); + ptrdiff_t cntr = 0; + ptrdiff_t nelem = THTensor_(nElement)(srct); + if (THTensor_(nElement)(tensor) != THByteTensor_nElement(mask)) + { + THTensor_(free)(srct); + THError("Number of elements of destination tensor != Number of elements in mask"); + } + TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, + if (*mask_data > 1) + { + THTensor_(free)(srct); + THFree(mask_counter); + THFree(tensor_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + if (cntr == nelem) + { + THTensor_(free)(srct); + THFree(mask_counter); + THFree(tensor_counter); + THError("Number of elements of src < number of ones in mask"); + } + *tensor_data = *src_data; + src_data++; + cntr++; + }); + THTensor_(free)(srct); +} + +void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) +{ + ptrdiff_t numel = THByteTensor_sumall(mask); + real *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = THTensor_(data)(tensor); + TH_TENSOR_APPLY2(real, src, unsigned char, mask, + if (*mask_data 
> 1) + { + THFree(mask_counter); + THFree(src_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +// Finds non-zero elements of a tensor and returns their subscripts +void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) +{ + ptrdiff_t numel = 0; + int64_t *subscript_data; + int64_t i = 0; + int64_t dim; + int64_t div = 1; +#ifdef TH_REAL_IS_HALF +#define IS_NONZERO(val) ((val.x & 0x7fff) != 0) +#else +#define IS_NONZERO(val) ((val)!=0) +#endif + + /* First Pass to determine size of subscripts */ + TH_TENSOR_APPLY(real, tensor, + if IS_NONZERO(*tensor_data) { + ++numel; + }); +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THLongTensor_resize2d(subscript, numel, tensor->dim()); + + /* Second pass populates subscripts */ + subscript_data = THLongTensor_data(subscript); + TH_TENSOR_APPLY(real, tensor, + if IS_NONZERO(*tensor_data) { + div = 1; + + for (dim = tensor->dim() - 1; dim >= 0; dim--) { + *(subscript_data + dim) = (i/div) % tensor->size[dim]; + div *= tensor->size[dim]; + } + + subscript_data += tensor->dim(); + } + ++i;); +} + +void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) +{ + ptrdiff_t i, numel; + THLongStorage *newSize; + THTensor *tSlice, *sSlice; + int64_t *index_data; + real *tensor_data, *src_data; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(index->_dim() <= 1, 3, "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < src->_dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(src->_dim() > 0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif + + numel = THLongTensor_nElement(index); + + newSize = THLongStorage_newWithSize(src->dim()); + THLongStorage_rawCopy(newSize,src->size); +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THLongStorage_data(newSize)[dim] = numel; + THTensor_(resize)(tensor,newSize,NULL); + THLongStorage_free(newSize); + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + if (dim == 0 && THTensor_(isContiguous)(src) && THTensor_(isContiguous)(tensor)) + { + tensor_data = THTensor_(data)(tensor); + src_data = THTensor_(data)(src); + ptrdiff_t rowsize = src->size[0] == 0 ? 
1: THTensor_(nElement)(src) / src->size[0]; + + // check that the indices are within range + int64_t max = src->size[0] - 1 + TH_INDEX_BASE; + for (i=0; i max) { + THLongTensor_free(index); + THError("index out of range"); + } + } + + if (src->dim() == 1) { + #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; idim() == 1) + { + for (i=0; idim() > 1 ) + { + tSlice = THTensor_(new)(); + sSlice = THTensor_(new)(); + + for (i=0; isize; + int64_t *stride = tensor->stride; + int nDim = tensor->_dim(); + ptrdiff_t dataOffset = 0; + for (int i = nDim - 1; i >= 0; i--) { + dataOffset += (linearIndex % size[i]) * stride[i]; + linearIndex /= size[i]; + } + return dataOffset; +} + +static inline void THTensor_(checkLinearIndex)(int64_t linearIndex, int64_t numel) { + THArgCheck(linearIndex < numel && linearIndex >= -numel, 2, "out of range: %d out of %d", (int)linearIndex, (int)numel); +} + +static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t numel) { + return linearIndex < 0 ? linearIndex + numel : linearIndex; +} + +void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) +{ + THTensor_(resizeNd)(r_, index->dim(), index->size, NULL); + THTensor* dst = THTensor_(newContiguous)(r_); + + index = THLongTensor_newContiguous(index); + int64_t* index_data = THLongTensor_data(index); + ptrdiff_t srcElements = THTensor_(nElement)(src); + real* src_data = THTensor_(data)(src); + real* dst_data = THTensor_(data)(dst); + ptrdiff_t nIndices = THLongTensor_nElement(index); + int isContiguous = THTensor_(isContiguous)(src); + + // Exceptions must not be thrown across OpenMP parallel sections, so we + // record the position of the invalid index and throw the exception after the + // loop. + std::atomic invalidIdxPos(-1); + + ptrdiff_t i; + #pragma omp parallel for if(nIndices > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i = 0; i < nIndices; i++) { + int64_t idx = index_data[i]; + if (idx < srcElements && idx >= -srcElements) { + idx = THTensor_(wrapLinearIndex)(idx, srcElements); + if (isContiguous) { + dst_data[i] = src_data[idx]; + } else { + dst_data[i] = src_data[THTensor_(dataOffset)(src, idx)]; + } + } else { + int64_t tmp = -1; + invalidIdxPos.compare_exchange_strong(tmp, i); + } + } + + if (invalidIdxPos >= 0) { + THTensor_(checkLinearIndex)(index_data[invalidIdxPos], srcElements); + } + + THLongTensor_free(index); + THTensor_(freeCopyTo)(dst, r_); +} + +void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate) +{ + THArgCheck(THLongTensor_nElement(index) == THTensor_(nElement)(src), 3, + "src should have the same number of elements as index"); + + index = THLongTensor_newContiguous(index); + src = THTensor_(newContiguous)(src); + real* data = THTensor_(data)(tensor); + ptrdiff_t numel = THTensor_(nElement)(tensor); + int is_contiguous = THTensor_(isContiguous)(tensor); + + TH_TENSOR_APPLY2(int64_t, index, real, src, + THTensor_(checkLinearIndex)(*index_data, numel); + int64_t linearIndex = THTensor_(wrapLinearIndex)(*index_data, numel); + int64_t dataOffset = is_contiguous ? 
linearIndex : THTensor_(dataOffset)(tensor, linearIndex); + if (accumulate) { + data[dataOffset] += *src_data; + } else { + data[dataOffset] = *src_data; + } + ); + + THTensor_(free)(src); + THLongTensor_free(index); +} + +void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + ptrdiff_t i, numel; + THTensor *tSlice, *sSlice; + int64_t *index_data; + + numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + if (tensor->dim() > 1) + { + tSlice = THTensor_(new)(); + sSlice = THTensor_(new)(); + + for (i=0; i_dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + for (i=0; idim() > 1) + { + tSlice = THTensor_(new)(); + THTensor_(select)(tSlice, tensor,dim,index_data[i] - TH_INDEX_BASE); + THTensor_(fill)(tSlice, val); + THTensor_(free)(tSlice); + } + else + { + THTensor_(set1d)(tensor, index_data[i] - TH_INDEX_BASE, val); + } + } + THLongTensor_free(index); +} + +void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(dim >= 0 && dim < THTensor_(nDimension)(tensor), 3, + "Index dimension is out of bounds"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, + "Input tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= src_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in gather"); + } + *(tensor_data + i*tensor_stride) = src_data[(idx - TH_INDEX_BASE) * src_stride]; + }) +} + +void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + int64_t elems_per_row, i, idx; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#else + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == 
THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#endif + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_SCATTER, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatter"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = *(src_data + i*src_stride); + }) +} + +void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_SCATTER, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatterAdd"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] += *(src_data + i*src_stride); + }) +} + +void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY2(real, tensor, int64_t, index, dim, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatter"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = val; + }) +} + +accreal THTensor_(dot)(THTensor *tensor, THTensor *src) +{ + accreal sum = 0; + /* we use a trick here. careful with that. */ + TH_TENSOR_APPLY2(real, tensor, real, src, + int64_t sz = (tensor_size-tensor_i < src_size-src_i ? 
tensor_size-tensor_i : src_size-src_i); + sum += THBlas_(dot)(sz, src_data, src_stride, tensor_data, tensor_stride); + tensor_i += sz; + src_i += sz; + tensor_data += sz*tensor_stride; + src_data += sz*src_stride; + break;); + return sum; +} + + +#undef th_isnan +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#define th_isnan(val) \ +(std::isnan(val)) +#else +#define th_isnan(val) (0) +#endif + +#undef th_isnan_break +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#define th_isnan_break(val) \ +if (std::isnan(val)) break; +#else +#define th_isnan_break(val) +#endif + +real THTensor_(minall)(THTensor *tensor) +{ + real theMin; + real value; + + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + theMin = THTensor_(data)(tensor)[0]; + TH_TENSOR_APPLY(real, tensor, + value = *tensor_data; + /* This is not the same as value= theMin)) + { + theMin = value; + th_isnan_break(value) + }); + return theMin; +} + +real THTensor_(maxall)(THTensor *tensor) +{ + real theMax; + real value; + + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + theMax = THTensor_(data)(tensor)[0]; + TH_TENSOR_APPLY(real, tensor, + value = *tensor_data; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value <= theMax)) + { + theMax = value; + th_isnan_break(value) + }); + return theMax; +} + +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride); + +real THTensor_(medianall)(THTensor *tensor) +{ + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + + real theMedian; + ptrdiff_t numel; + int64_t k; + THTensor *temp_; + real *temp__data; + + numel = THTensor_(nElement)(tensor); + k = (numel-1) >> 1; + + temp_ = THTensor_(newClone)(tensor); + temp__data = THTensor_(data)(temp_); + + THTensor_(quickselectnoidx)(temp__data, k, numel, 1); + + theMedian = temp__data[k]; + + THTensor_(free)(temp_); + + return theMedian; +} + +accreal THTensor_(sumall)(THTensor *tensor) +{ + accreal sum = 0; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, +:sum, sum += *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, sum += *tensor_data;); + } + return sum; +} + +accreal THTensor_(prodall)(THTensor *tensor) +{ + accreal prod = 1; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, *:prod, prod *= *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, prod *= *tensor_data;); + } + return prod; +} + +void THTensor_(add)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(adds)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data + value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, 
r_, real, t, *r__data = *t_data + value;); + } +} + +void THTensor_(sub)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(add)(r_, t, -value); +} + +void THTensor_(add_scaled)(THTensor *r_, THTensor *t, real value, real alpha) +{ + THTensor_(add)(r_, t, value * alpha); +} + +void THTensor_(sub_scaled)(THTensor *r_, THTensor *t, real value, real alpha) +{ + THTensor_(add)(r_, t, -value * alpha); +} + +void THTensor_(mul)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(muls)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data * value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;); + } +} + +void THTensor_(div)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(divs)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data / value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;); + } +} + +void THTensor_(lshift)(THTensor *r_, THTensor *t, real value) +{ +#if defined(TH_REAL_IS_FLOAT) + return THTensor_(mul)(r_, t, powf(2, value)); +#elif defined(TH_REAL_IS_DOUBLE) + return THTensor_(mul)(r_, t, pow(2, value)); +#elif defined(TH_REAL_IS_HALF) + return THError("lshift is not supported for torch.HalfTensor"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i> value; +#else + rp[i] = ((ureal) tp[i]) >> value; +#endif + } + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (((real) *t_data) >> value);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#else + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (((ureal) *t_data) >> value);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#endif + } +#else + serial_path = 1; +#endif + } + if (serial_path) { +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value);); +#else + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((ureal) *t_data) >> value);); +#endif + } +#endif +} + +void THTensor_(fmod)(THTensor 
*r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i max_value ? max_value : tp[i]); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);); + } +} + +void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + if(r_ == t) { + THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1); + } else { + TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len);); + } + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;); + } +} + +void THTensor_(csub)(THTensor *r_, THTensor *t, real value, THTensor *src) +{ + THTensor_(cadd)(r_, t, -value, src); +} + +void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cmul)(r__data, t_data, src_data, r__len);); + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;); + } +} + +void THTensor_(pow)(THTensor *r_, THTensor *t, real value) +{ + 
THTensor_(resizeAs)(r_, t); + if(value == 1){ + THTensor_(copy)(r_, t); + } + else if(value == 2){ + THTensor_(cmul)(r_, t, t); + } + else if(value == 3){ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * *t_data * *t_data;); + } +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#if defined (TH_REAL_IS_FLOAT) +#define TH_MATH_NAME(fn) fn##f +#else +#define TH_MATH_NAME(fn) fn +#endif + else if(value == 0.5){ + THTensor_(sqrt)(r_, t); + } + else if(value == -0.5){ + THTensor_(rsqrt)(r_, t); + } + else if(value == -1){ + THTensor_(cinv)(r_, t); + } + else if(value == -2){ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = TH_MATH_NAME(1.0) / (*t_data * *t_data);); + } + else{ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = TH_MATH_NAME(pow)(*t_data, value);); + } +#undef TH_MATH_NAME +#else + else { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = THTensor_(powOne)(*t_data, value);); + } +#endif +} + +void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + real *tp = THTensor_(data)(t); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i> sp[i]; +#else + rp[i] = ((ureal) tp[i]) >> sp[i]; +#endif + } + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { +#if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#elif defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#else + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = ((ureal)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#endif + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { +#if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data);); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data);); +#elif defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;); +#else + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((ureal)*t_data) >> *src_data;); +#endif + } +} + +void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; 
+ if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + real *tp = THTensor_(data)(t); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; idim() != 2) || (vec->dim() != 1) ) + THError("matrix and vector expected, got %dD, %dD", + mat->dim(), vec->dim()); + + if( mat->size[1] != vec->size[0] ) { + THDescBuff bm = THTensor_(sizeDesc)(mat); + THDescBuff bv = THTensor_(sizeDesc)(vec); + THError("size mismatch, %s, %s", bm.str, bv.str); + } + + if(t->dim() != 1) + THError("vector expected, got t: %dD", t->dim()); + + if(t->size[0] != mat->size[0]) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bm = THTensor_(sizeDesc)(mat); + THError("size mismatch, t: %s, mat: %s", bt.str, bm.str); + } + + if(r_ != t) + { + THTensor_(resizeAs)(r_, t); + THTensor_(copy)(r_, t); + } + + // n == 1 || lda >= max(1, m) + #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) + + if(mat->stride[0] == 1 && LDA_COND(mat->size[0], mat->size[1], mat->stride[1])) + { + THBlas_(gemv)('n', mat->size[0], mat->size[1], + alpha, THTensor_(data)(mat), mat->stride[1], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + } + else if(mat->stride[1] == 1 && LDA_COND(mat->size[1], mat->size[0], mat->stride[0])) + { + THBlas_(gemv)('t', mat->size[1], mat->size[0], + alpha, THTensor_(data)(mat), mat->stride[0], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + } + else + { + THTensor *cmat = THTensor_(newContiguous)(mat); + + THBlas_(gemv)('t', mat->size[1], mat->size[0], + alpha, THTensor_(data)(cmat), cmat->stride[0], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + + THTensor_(free)(cmat); + } + + #undef LDA_COND +} + +void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) +{ + int64_t N1 = m1->size[0]; + int64_t N2 = m2->size[0]; + int64_t dim; + real *m1_p; + real *m2_p; + real *r_p; + int64_t i; + + THTensor_(resize2d)(r_, N1, N2); + + m1 = THTensor_(newContiguous)(m1); + m2 = THTensor_(newContiguous)(m2); + + THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1); + THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2); + + dim = m1->size[1]; + THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim"); + + m1_p = THTensor_(data)(m1); + m2_p = THTensor_(data)(m2); + r_p = THTensor_(data)(r_); + +#pragma omp parallel for private(i) + for (i=0; idim() != 2) || (m2->dim() != 2)) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); + + if(m1->size[1] != m2->size[0]) { + THDescBuff bm1 = THTensor_(sizeDesc)(m1); + THDescBuff bm2 = THTensor_(sizeDesc)(m2); + THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); + } + + if( t->dim() != 2 ) + THError("matrix expected, got %dD tensor for t", t->dim()); + + if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bm1 = THTensor_(sizeDesc)(m1); + THDescBuff bm2 = THTensor_(sizeDesc)(m2); + THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str); + } + + if(t != r_) + { + THTensor_(resizeAs)(r_, t); + if 
(beta != 0.0) { + THTensor_(copy)(r_, t); + } + } + + // n == 1 || ldc >= max(1, m) + #define LDC_COND(M, N, LDC) ((N) == 1 || (LDC) >= THMax(1, M)) + + /* r_ */ + if(r_->stride[0] == 1 && + LDC_COND(r_->size[0], r_->size[1], r_->stride[1])) + { + transpose_r = 'n'; + r__ = r_; + } + else if(r_->stride[1] == 1 && + LDC_COND(r_->size[1], r_->size[0], r_->stride[0])) + { + THTensor *swap = m2; + m2 = m1; + m1 = swap; + transpose_r = 't'; + r__ = r_; + } + else + { + transpose_r = 'n'; + // make r__ FORTRAN contiguous + THTensor *transp_r_ = THTensor_(newTranspose)(r_, 0, 1); + r__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(r__, NULL, 0, 1); + } + + #undef LDC_COND + + int64_t m = r__->size[(transpose_r == 'n' ? 0 : 1)]; + int64_t n = r__->size[(transpose_r == 'n' ? 1 : 0)]; + int64_t k = m1->size[(transpose_r == 'n' ? 1 : 0)]; + int64_t ldr__ = r__->stride[(transpose_r == 'n' ? 1 : 0)]; + + /* m1 */ + /* Need ldm1_ >= max(1, (transpose_m1 == 'n' ? m : k)) */ + if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m1->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, m)) + { + transpose_m1 = 'n'; + m1_ = m1; + } + else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m1->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, k)) + { + transpose_m1 = 't'; + m1_ = m1; + } + else + { + transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); + m1_ = THTensor_(newContiguous)(m1); + free_m1 = 1; + } + + /* m2 */ + /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ + if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m2->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, k)) + { + transpose_m2 = 'n'; + m2_ = m2; + } + else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m2->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, n)) + { + transpose_m2 = 't'; + m2_ = m2; + } + else + { + transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); + m2_ = THTensor_(newContiguous)(m2); + free_m2 = 1; + } + + int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]); + int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 
0 : 1)]); + +#pragma omp critical(blasgemm) + /* do the operation */ + THBlas_(gemm)(transpose_m1, + transpose_m2, + m, + n, + k, + alpha, + THTensor_(data)(m1_), + ldm1_, + THTensor_(data)(m2_), + ldm2_, + beta, + THTensor_(data)(r__), + ldr__); + + /* free intermediate variables */ + if(free_m1) + THTensor_(free)(m1_); + + if(free_m2) + THTensor_(free)(m2_); + + if(r__ != r_) + THTensor_(freeCopyTo)(r__, r_); +} + +void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) +{ + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) + THError("vector and vector expected, got %dD, %dD tensors", + vec1->dim(), vec2->dim()); + + if(t->dim() != 2) + THError("expected matrix, got %dD tensor for t", t->dim()); + + if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bv1 = THTensor_(sizeDesc)(vec1); + THDescBuff bv2 = THTensor_(sizeDesc)(vec2); + THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str); + } + + if(r_ != t) + { + THTensor_(resizeAs)(r_, t); + THTensor_(copy)(r_, t); + } + + if(beta == 0) { + THTensor_(zero)(r_); + } + else if(beta != 1) + THTensor_(mul)(r_, r_, beta); + + // n == 1 || lda >= max(1, m) + #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) + + if(r_->stride[0] == 1 && LDA_COND(vec1->size[0], vec2->size[0], r_->stride[1])) + { + THBlas_(ger)(vec1->size[0], vec2->size[0], + alpha, THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(r_), r_->stride[1]); + } + else if(r_->stride[1] == 1 && LDA_COND(vec2->size[0], vec1->size[0], r_->stride[0])) + { + THBlas_(ger)(vec2->size[0], vec1->size[0], + alpha, THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(r_), r_->stride[0]); + } + else + { + THTensor *cr = THTensor_(newClone)(r_); + + THBlas_(ger)(vec2->size[0], vec1->size[0], + alpha, THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(cr), cr->stride[0]); + + THTensor_(freeCopyTo)(cr, r_); + } + + #undef LDA_COND +} + +void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) +{ + int64_t batch; + + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor"); + THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, + "equal number of batches expected, got %d, %d", + THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); + THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, + "wrong matrix size, batch1: %dx%d, batch2: %dx%d", + THTensor_(size)(batch1, 1), THTensor_(size)(batch1,2), + THTensor_(size)(batch2, 1), THTensor_(size)(batch2,2)); + + int64_t dim1 = THTensor_(size)(batch1, 1); + int64_t dim2 = THTensor_(size)(batch2, 2); + THArgCheck(THTensor_(size)(t, 0) == dim1, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 1) == dim2, 1, "output tensor of incorrect size"); + + if (t != result) { + THTensor_(resizeAs)(result, t); + if (beta != 0.0) { + THTensor_(copy)(result, t); + } + } + + THTensor *matrix1 = THTensor_(new)(); + THTensor *matrix2 = THTensor_(new)(); + + for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { + THTensor_(select)(matrix1, batch1, 0, batch); + THTensor_(select)(matrix2, batch2, 0, batch); + + THTensor_(addmm)(result, beta, result, alpha, matrix1, 
matrix2); + beta = 1; // accumulate output once + } + + THTensor_(free)(matrix1); + THTensor_(free)(matrix2); +} + +void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) +{ + int64_t batch; + + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1)); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2)); + THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, + "equal number of batches expected, got %d, %d", + THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); + THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, + "wrong matrix size, batch1: %dx%d, batch2: %dx%d", + THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2), + THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2)); + + int64_t bs = THTensor_(size)(batch1, 0); + int64_t dim1 = THTensor_(size)(batch1, 1); + int64_t dim2 = THTensor_(size)(batch2, 2); + THArgCheck(THTensor_(size)(t, 0) == bs, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 1) == dim1, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 2) == dim2, 1, "output tensor of incorrect size"); + + if (t != result) { + THTensor_(resizeAs)(result, t); + if (beta != 0.0) { + THTensor_(copy)(result, t); + } + } + + THTensor *matrix1 = THTensor_(new)(); + THTensor *matrix2 = THTensor_(new)(); + THTensor *result_matrix = THTensor_(new)(); + + for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { + THTensor_(select)(matrix1, batch1, 0, batch); + THTensor_(select)(matrix2, batch2, 0, batch); + THTensor_(select)(result_matrix, result, 0, batch); + + THTensor_(addmm)(result_matrix, beta, result_matrix, alpha, matrix1, matrix2); + } + + THTensor_(free)(matrix1); + THTensor_(free)(matrix2); + THTensor_(free)(result_matrix); +} + +ptrdiff_t THTensor_(numel)(THTensor *t) +{ + return THTensor_(nElement)(t); +} + + +// Helper function to be used in a reduction operation. +// Due to resize semantics of outputs, if the specified output tensor r_ has +// same size as the output of the reduction operation, then any noncontiguities +// in r_ should be preserved. +// The reduction operation, however, needs to act on r_ with an extra dimension +// (the reduced dimension), so this function "resizes" r_ and preserves its +// noncontiguities if necessary. 
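+// As a rough illustration (the shapes here are hypothetical, not taken from any
+// particular caller): reducing a 4x3 tensor over dimension 1 with keepdim == 0
+// produces a size-4 result. If the caller passed a 1-D r_ of size 4, possibly
+// noncontiguous, the reduction code below still needs to address r_ with the
+// reduced dimension present, so the helper re-inserts it as a size-1 dimension:
+//
+//   THTensor_(preserveReduceDimSemantics)(r_, /*in_dims=*/2, /*reduce_dimension=*/1, /*keepdim=*/0);
+//   // r_ is now viewed as 4x1; the squeeze1d at the end of each caller drops it again.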
+void THTensor_(preserveReduceDimSemantics)( + THTensor *r_, int in_dims, int reduce_dimension, int keepdim) { + if (r_ && !keepdim && + THTensor_(_nDimension)(r_) == in_dims - 1 && + THTensor_(_nDimension)(r_) != 0) { + THTensor_(unsqueeze1d)(r_, r_, reduce_dimension); + } +} + +void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + real theMax; + real value; + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + theMax = t_data[0]; + theIndex = 0; + + for(i = 0; i < t_size; i++) + { + value = t_data[i*t_stride]; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value <= theMax)) + { + theIndex = i; + theMax = value; + th_isnan_break(value) + } + } + *indices__data = theIndex; + *values__data = theMax;); + } else { + if (THTensor_(_nDimension)(t) > 1) { + THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); + THTensor_(copy)(values_, t0); + THTensor_(free)(t0); + } else { + THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); + } + THLongTensor_zero(indices_); + + if(t->size[dimension] == 1) { + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } + return; + } + + THTensor *tempValues_ = THTensor_(newWithTensor)(values_); + // tempValues_.expand_as(t) + tempValues_->size[dimension] = t->size[dimension]; + tempValues_->stride[dimension] = 0; + + THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); + // tempIndices_.expand_as(t) + tempIndices_->size[dimension] = t->size[dimension]; + tempIndices_->stride[dimension] = 0; + + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, + if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { + *tempValues__data = *t_data; + *tempIndices__data = *tempIndices__dimOffset; + }); + + THTensor_(free)(tempValues_); + THLongTensor_free(tempIndices_); + } + + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + real 
theMax; + real value; + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + theMax = t_data[0]; + theIndex = 0; + + for(i = 0; i < t_size; i++) + { + value = t_data[i*t_stride]; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value >= theMax)) + { + theIndex = i; + theMax = value; + th_isnan_break(value) + } + } + *indices__data = theIndex; + *values__data = theMax;); + } else { + if (THTensor_(_nDimension)(t) > 1) { + THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); + THTensor_(copy)(values_, t0); + THTensor_(free)(t0); + } else { + THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); + } + THLongTensor_zero(indices_); + + if(t->size[dimension] == 1) { + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } + return; + } + + THTensor *tempValues_ = THTensor_(newWithTensor)(values_); + // tempValues_.expand_as(t) + tempValues_->size[dimension] = t->size[dimension]; + tempValues_->stride[dimension] = 0; + + THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); + // tempIndices_.expand_as(t) + tempIndices_->size[dimension] = t->size[dimension]; + tempIndices_->stride[dimension] = 0; + + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, + if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { + *tempValues__data = *t_data; + *tempIndices__data = *tempIndices__dimOffset; + }); + + THTensor_(free)(tempValues_); + THLongTensor_free(tempIndices_); + } + + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > HYPER_TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 0; + for(j=0; j < t->size[dimension]; ++j) { + *r__data += *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + if (serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal sum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + sum += t_data[i*t_stride]; + *r__data = (real)sum;); + } else { + THTensor_(zero)(r_); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // 
r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;); + THTensor_(free)(temp_); + } + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > HYPER_TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 1; + for(j=0; j < t->size[dimension]; ++j) { + *r__data *= *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + + if(serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal prod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + prod *= t_data[i*t_stride]; + *r__data = (real)prod;); + } else { + THTensor_(fill)(r_, 1); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;); + THTensor_(free)(temp_); + } + } + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, t); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal cumsum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + { + cumsum += t_data[i*t_stride]; + r__data[i*r__stride] = (real)cumsum; + }); +} + +void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, t); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal cumprod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + { + cumprod *= t_data[i*t_stride]; + r__data[i*r__stride] = (real)cumprod; + }); +} + + +void THTensor_(sign)(THTensor *r_, THTensor *t) +{ + THTensor_(resizeAs)(r_, t); + +#if defined (TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(real, r_, real, t, + if (*t_data > 0) *r__data = 1; + else *r__data = 0;); +#else + TH_TENSOR_APPLY2(real, r_, real, t, + if (*t_data > 0) *r__data = 1; + else if (*t_data < 0) 
*r__data = -1; + else *r__data = 0;); +#endif +} + + +accreal THTensor_(trace)(THTensor *t) +{ + real *t_data = THTensor_(data)(t); + accreal sum = 0; + int64_t i = 0; + int64_t t_stride_0, t_stride_1, t_diag_size; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + t_diag_size = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)); + while(i < t_diag_size) + { + sum += t_data[i*(t_stride_0+t_stride_1)]; + i++; + } + + return sum; +} + +void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) +{ + int i; + + if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b)) + THError("inconsistent tensor dimension %dD, %dD", + THTensor_(nDimension)(a), THTensor_(nDimension)(b)); + + for(i = 0; i < THTensor_(nDimension)(a); i++) + { + if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) { + THDescBuff ba = THTensor_(sizeDesc)(a); + THDescBuff bb = THTensor_(sizeDesc)(b); + THError("inconsistent tensor sizes %s, %s", ba.str, bb.str); + } + } + + if(dimension < 0) + { + for(i = 0; i < THTensor_(nDimension)(a); i++) + { + if(THTensor_(size)(a, i) == 3) + { + dimension = i; + break; + } + } + if(dimension < 0) { + THDescBuff ba = THTensor_(sizeDesc)(a); + THError("no dimension of size 3 in a: %s", ba.str); + } + } + + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(a), 3, "dimension %d out of range", + dimension + TH_INDEX_BASE); + THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, a); + + TH_TENSOR_DIM_APPLY3(real, a, real, b, real, r_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + r__data[0*r__stride] = a_data[1*a_stride]*b_data[2*b_stride] - a_data[2*a_stride]*b_data[1*b_stride]; + r__data[1*r__stride] = a_data[2*a_stride]*b_data[0*b_stride] - a_data[0*a_stride]*b_data[2*b_stride]; + r__data[2*r__stride] = a_data[0*a_stride]*b_data[1*b_stride] - a_data[1*a_stride]*b_data[0*b_stride];); +} + +void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY3(real, r, real, t, real, src, + *r_data = *t_data > *src_data ? *t_data : *src_data;); +} + +void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY3(real, r, real, t, real, src, + *r_data = *t_data < *src_data ? *t_data : *src_data;); +} + +void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY2(real, r, real, t, + *r_data = *t_data < value ? value : *t_data;); // this order propagates NaN +} + +void THTensor_(cminValue)(THTensor *r, THTensor *t, real value) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY2(real, r, real, t, + *r_data = *t_data > value ? 
value : *t_data;); // this order propagates NaN +} + +void THTensor_(zerosLike)(THTensor *r_, THTensor *input) +{ + THTensor_(resizeAs)(r_, input); + THTensor_(zero)(r_); +} + +void THTensor_(onesLike)(THTensor *r_, THTensor *input) +{ + THTensor_(resizeAs)(r_, input); + THTensor_(fill)(r_, 1); +} + +void THTensor_(diag)(THTensor *r_, THTensor *t, int k) +{ +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!t->is_empty()) +#endif + THArgCheck(THTensor_(nDimension)(t) == 1 || THTensor_(nDimension)(t) == 2, 1, "matrix or a vector expected"); + + if(THTensor_(nDimension)(t) == 1) + { + real *t_data = THTensor_(data)(t); + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_size = THTensor_(size)(t, 0); + int64_t sz = t_size + (k >= 0 ? k : -k); + real *r__data; + int64_t r__stride_0; + int64_t r__stride_1; + int64_t i; + + THTensor_(resize2d)(r_, sz, sz); + THTensor_(zero)(r_); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data += (k >= 0 ? k*r__stride_1 : -k*r__stride_0); + + for(i = 0; i < t_size; i++) + r__data[i*(r__stride_0+r__stride_1)] = t_data[i*t_stride_0]; + } + else + { + real *t_data = THTensor_(data)(t); + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_stride_1 = THTensor_(stride)(t, 1); + int64_t sz; + real *r__data; + int64_t r__stride_0; + int64_t i; + + if(k >= 0) + sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k); + else + sz = THMin(THTensor_(size)(t, 0)+k, THTensor_(size)(t, 1)); + THTensor_(resize1d)(r_, sz); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_, 0); + + t_data += (k >= 0 ? k*t_stride_1 : -k*t_stride_0); + for(i = 0; i < sz; i++) + r__data[i*r__stride_0] = t_data[i*(t_stride_0+t_stride_1)]; + } +} + +void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m) +{ + real *r__data; + int64_t i, sz; + + THArgCheck(n > 0, 1, "invalid argument"); + + if(m <= 0) + m = n; + + THTensor_(resize2d)(r_, n, m); + THTensor_(zero)(r_); + + i = 0; + r__data = THTensor_(data)(r_); + sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1)); + for(i = 0; i < sz; i++) + r__data[i*(r_->stride[0]+r_->stride[1])] = 1; +} + + +void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step) +{ + ptrdiff_t size; + real i = 0; + + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + + size = (ptrdiff_t) (((xmax - xmin) / step) + 1); + + if (THTensor_(nElement)(r_) != size) { + THTensor_(resize1d)(r_, size); + } + + TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;); +} + +void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step) { + ptrdiff_t size; + real i = 0; + + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + + size = (ptrdiff_t) ceil((double)(xmax - xmin) / step); + + if (THTensor_(nElement)(r_) != size) { + THTensor_(resize1d)(r_, size); + } + + TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;); +} + +void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n) +{ + real *r__data; + int64_t r__stride_0; + int64_t i; + + THArgCheck(n > 0, 1, "must be strictly positive"); + + THTensor_(resize1d)(r_, n); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_,0); + + for(i = 0; i < n; 
i++) + r__data[i*r__stride_0] = (real)(i); + + for(i = 0; i < n-1; i++) + { + int64_t z = THRandom_random(_generator) % (n-i); + real sav = r__data[i*r__stride_0]; + r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; + r__data[(z+i)*r__stride_0] = sav; + } +} + +/* I cut and pasted (slightly adapted) the quicksort code from + Sedgewick's 1978 "Implementing Quicksort Programs" article + http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf + + It is the state of the art existing implementation. The macros + are here to make as close a match as possible to the pseudocode of + Program 2 p.851 + + Note that other partition schemes exist, and are typically presented + in textbook, but those are less efficient. See e.g. + http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto + + Julien, November 12th 2013 +*/ +#define MAX_LEVELS 300 +#define M_SMALL 10 /* Limit for small subfiles */ + +#define ARR(III) arr[(III)*stride] +#define IDX(III) idx[(III)*stride] + +#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap +#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap + +#define ARR_SWAP(III, JJJ) \ + REAL_SWAP(ARR(III), ARR(JJJ)); + +#define BOTH_SWAP(III, JJJ) \ + REAL_SWAP(ARR(III), ARR(JJJ)); \ + LONG_SWAP(IDX(III), IDX(JJJ)) + +static void THTensor_(quicksortascend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) +{ + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + real rswap, piv; + unsigned char done = 0; + + /* beg[0]=0; end[0]=elements; */ + stack = 0; + L = 0; R = elements-1; + done = elements-1 <= M_SMALL; + + while(!done) { + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; j = R; piv = ARR(L); pid = IDX(L); + + do { + do { i = i+1; } while(ARR(i) < piv); + do { j = j-1; } while(ARR(j) > piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + /* Left subfile is (L, j-1) */ + /* Right subfile is (i, R) */ + sz_left = j-L; + sz_right = R-i+1; + if (sz_left <= M_SMALL && sz_right <= M_SMALL) { + /* both subfiles are small */ + /* if stack empty */ + if (stack == 0) { + done = 1; + } else { + stack--; + L = beg[stack]; + R = end[stack]; + } + } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { + /* exactly one of the subfiles is small */ + /* (L,R) = large subfile */ + if (sz_left > sz_right) { + /* Implicit: L = L; */ + R = j-1; + } else { + L = i; + /* Implicit: R = R; */ + } + } else { + /* none of the subfiles is small */ + /* push large subfile */ + /* (L,R) = small subfile */ + if (sz_left > sz_right) { + beg[stack] = L; + end[stack] = j-1; + stack++; + L = i; + /* Implicit: R = R */ + } else { + beg[stack] = i; + end[stack] = R; + stack++; + /* Implicit: L = L; */ + R = j-1; + } + } + } /* while not done */ + /* Now insertion sort on the concatenation of subfiles */ + for(i=elements-2; i>=0; i--) { + if (ARR(i) > ARR(i+1)) { + piv = ARR(i); + pid = IDX(i); + j = i+1; + do { + ARR(j-1) = ARR(j); + IDX(j-1) = IDX(j); + j = j+1; + } while(j < elements && ARR(j) < piv); + ARR(j-1) = piv; + IDX(j-1) = pid; + } + } +} + +static void THTensor_(quicksortdescend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) +{ + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + real rswap, piv; + unsigned char done = 0; + + /* 
beg[0]=0; end[0]=elements; */ + stack = 0; + L = 0; R = elements-1; + done = elements-1 <= M_SMALL; + + while(!done) { + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; j = R; piv = ARR(L); pid = IDX(L); + + do { + do { i = i+1; } while(ARR(i) > piv); + do { j = j-1; } while(ARR(j) < piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + /* Left subfile is (L, j-1) */ + /* Right subfile is (i, R) */ + sz_left = j-L; + sz_right = R-i+1; + if (sz_left <= M_SMALL && sz_right <= M_SMALL) { + /* both subfiles are small */ + /* if stack empty */ + if (stack == 0) { + done = 1; + } else { + stack--; + L = beg[stack]; + R = end[stack]; + } + } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { + /* exactly one of the subfiles is small */ + /* (L,R) = large subfile */ + if (sz_left > sz_right) { + /* Implicit: L = L; */ + R = j-1; + } else { + L = i; + /* Implicit: R = R; */ + } + } else { + /* none of the subfiles is small */ + /* push large subfile */ + /* (L,R) = small subfile */ + if (sz_left > sz_right) { + beg[stack] = L; + end[stack] = j-1; + stack++; + L = i; + /* Implicit: R = R */ + } else { + beg[stack] = i; + end[stack] = R; + stack++; + /* Implicit: L = L; */ + R = j-1; + } + } + } /* while not done */ + /* Now insertion sort on the concatenation of subfiles */ + for(i=elements-2; i>=0; i--) { + if (ARR(i) < ARR(i+1)) { + piv = ARR(i); + pid = IDX(i); + j = i+1; + do { + ARR(j-1) = ARR(j); + IDX(j-1) = IDX(j); + j = j+1; + } while(j < elements && ARR(j) > piv); + ARR(j-1) = piv; + IDX(j-1) = pid; + } + } +} + +#undef MAX_LEVELS +#undef M_SMALL + +void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(rt_, t); + THTensor_(copy)(rt_, t); + + { + THLongStorage *size = THTensor_(newSizeOf)(t); + THLongTensor_resize(ri_, size, NULL); + THLongStorage_free(size); + } + + if(descendingOrder) + { + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; + for(i = 0; i < ri__size; i++) + ri__data[i*ri__stride] = i; + THTensor_(quicksortdescend)(rt__data, ri__data, rt__size, rt__stride);) + } + else + { + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; + for(i = 0; i < ri__size; i++) + ri__data[i*ri__stride] = i; + THTensor_(quicksortascend)(rt__data, ri__data, rt__size, rt__stride);) + } +} + +/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's +public domain implementation at http://ndevilla.free.fr/median/median/ +Adapted similarly to the above Quicksort algorithm. +This version does not produce indices along with values. 
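+
+As a rough sketch of the contract the callers rely on (the sample values below are
+hypothetical): after partitioning around a pivot, only the side containing position
+k is kept, so the expected cost is O(n) rather than O(n log n), and on return
+arr[k] holds the k-th smallest element (0-based) of the slice. For example, with
+
+  real data[5] = {3, 1, 4, 1, 5};
+  THTensor_(quickselectnoidx)(data, 2, 5, 1);
+
+data[2] afterwards holds 3, the third-smallest value; medianall() picks exactly
+that slot after copying the tensor into a contiguous clone, and kthvalue()/topk()
+use the indexed variant below in the same way.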
*/ +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride) +{ + int64_t P, L, R, i, j; + real rswap, piv; + L = 0; + R = elements-1; + + do { + if (R <= L) /* One element only */ + return; + + if (R == L+1) { /* Two elements only */ + if (ARR(L) > ARR(R)) { + ARR_SWAP(L, R); + } + return; + } + + /* Use median of three for pivot choice */ + P=(L+R)>>1; + ARR_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { ARR_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { ARR_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { ARR_SWAP(L+1, L); } + + i = L+1; + j = R; + piv = ARR(L); + do { + do i++; while(ARR(i) < piv); + do j--; while(ARR(j) > piv); + if (j < i) + break; + ARR_SWAP(i, j); + } while(1); + ARR_SWAP(L, j); + + /* Re-set active partition */ + if (j <= k) L=i; + if (j >= k) R=j-1; + } while(1); +} + +/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's +public domain implementation at http://ndevilla.free.fr/median/median/ +Adapted similarly to the above Quicksort algorithm. */ +static void THTensor_(quickselect)(real *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride) +{ + int64_t P, L, R, i, j, swap; + real rswap, piv; + L = 0; + R = elements-1; + + do { + if (R <= L) /* One element only */ + return; + + if (R == L+1) { /* Two elements only */ + if (ARR(L) > ARR(R)) { + BOTH_SWAP(L, R); + } + return; + } + + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; + j = R; + piv = ARR(L); + do { + do i++; while(ARR(i) < piv); + do j--; while(ARR(j) > piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + + /* Re-set active partition */ + if (j <= k) L=i; + if (j >= k) R=j-1; + } while(1); +} + +#undef ARR +#undef IDX +#undef LONG_SWAP +#undef REAL_SWAP +#undef BOTH_SWAP + +void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + THTensor *temp_; + THLongTensor *tempi_; + real *temp__data; + int64_t *tempi__data; + int64_t t_size_dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + t_size_dim = THTensor_(size)(t, dimension); + + temp_ = THTensor_(new)(); + THTensor_(resize1d)(temp_, t_size_dim); + temp__data = THTensor_(data)(temp_); + + tempi_ = THLongTensor_new(); + THLongTensor_resize1d(tempi_, t_size_dim); + tempi__data = THLongTensor_data(tempi_); + + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + real mode = 0; + int64_t modei = 0; + int64_t temp_freq = 0; + int64_t max_freq = 0; + for(i = 0; i < t_size_dim; i++) + temp__data[i] = t_data[i*t_stride]; + for(i = 0; i < t_size_dim; i++) + tempi__data[i] = i; + THTensor_(quicksortascend)(temp__data, tempi__data, t_size_dim, 1); + + for(i = 0; i < t_size_dim; i++) + { + temp_freq++; + if ((i == t_size_dim - 1) || (temp__data[i] != temp__data[i+1])) + { + if (temp_freq > max_freq) + { + mode = 
temp__data[i]; + modei = tempi__data[i]; + max_freq = temp_freq; + } + temp_freq = 0; + } + } + *values__data = mode; + *indices__data = modei;); + + THTensor_(free)(temp_); + THLongTensor_free(tempi_); + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim) +{ + THLongStorage *dim; + THTensor *temp_; + THLongTensor *tempi_; + real *temp__data; + int64_t *tempi__data; + int64_t t_size_dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + t_size_dim = THTensor_(size)(t, dimension); + + temp_ = THTensor_(new)(); + THTensor_(resize1d)(temp_, t_size_dim); + temp__data = THTensor_(data)(temp_); + + tempi_ = THLongTensor_new(); + THLongTensor_resize1d(tempi_, t_size_dim); + tempi__data = THLongTensor_data(tempi_); + + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < t_size_dim; i++) + temp__data[i] = t_data[i*t_stride]; + for(i = 0; i < t_size_dim; i++) + tempi__data[i] = i; + THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1); + *values__data = temp__data[k-1]; + *indices__data = tempi__data[k-1];); + + THTensor_(free)(temp_); + THLongTensor_free(tempi_); + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + int64_t t_size_dim, k; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + + t_size_dim = THTensor_(size)(t, dimension); + k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */ + + THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim); +} + +void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) +{ +#ifndef USE_TH_SIZE_ZERO_DIM + int numDims = THTensor_(_nDimension)(t); +#else + int numDims = THTensor_(nDimension)(t); +#endif + THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); + + int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else + THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif + + THTensor *tmpResults = THTensor_(new)(); + THTensor_(resize1d)(tmpResults, sliceSize); + real *tmp__data = THTensor_(data)(tmpResults); + + THLongTensor *tmpIndices = THLongTensor_new(); + THLongTensor_resize1d(tmpIndices, sliceSize); + int64_t *tmpi__data = THLongTensor_data(tmpIndices); + + THLongStorage *topKSize = THTensor_(newSizeOf)(t); + THLongStorage_set(topKSize, dim, k); + THTensor_(resize)(rt_, topKSize, NULL); + THLongTensor_resize(ri_, topKSize, NULL); + THLongStorage_free(topKSize); + + if 
(dir) { + /* k largest elements, descending order (optional: see sorted) */ + int64_t K = sliceSize - k; + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < sliceSize; i++) + { + tmp__data[i] = t_data[i*t_stride]; + tmpi__data[i] = i; + } + if (K > 0) + THTensor_(quickselect)(tmp__data, tmpi__data, K - 1, sliceSize, 1); + if (sorted) + THTensor_(quicksortdescend)(tmp__data + K, tmpi__data + K, k, 1); + for(i = 0; i < k; i++) + { + rt__data[i*rt__stride] = tmp__data[i + K]; + ri__data[i*ri__stride] = tmpi__data[i + K]; + }) + } + else { + /* k smallest elements, ascending order (optional: see sorted) */ + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < sliceSize; i++) + { + tmp__data[i] = t_data[i*t_stride]; + tmpi__data[i] = i; + } + THTensor_(quickselect)(tmp__data, tmpi__data, k - 1, sliceSize, 1); + if (sorted) + THTensor_(quicksortascend)(tmp__data, tmpi__data, k - 1, 1); + for(i = 0; i < k; i++) + { + rt__data[i*rt__stride] = tmp__data[i]; + ri__data[i*ri__stride] = tmpi__data[i]; + }) + } + + THTensor_(free)(tmpResults); + THLongTensor_free(tmpIndices); +} + +void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k) +{ + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + int64_t r__stride_0, r__stride_1; + real *t_data, *r__data; + int64_t r, c; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + THTensor_(resizeAs)(r_, t); + + t_size_0 = THTensor_(size)(t, 0); + t_size_1 = THTensor_(size)(t, 1); + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data = THTensor_(data)(r_); + t_data = THTensor_(data)(t); + + for(r = 0; r < t_size_0; r++) + { + int64_t sz = THMin(r+k+1, t_size_1); + for(c = THMax(0, r+k+1); c < t_size_1; c++) + r__data[r*r__stride_0+c*r__stride_1] = 0; + for(c = 0; c < sz; c++) + r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; + } +} + +void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k) +{ + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + int64_t r__stride_0, r__stride_1; + real *t_data, *r__data; + int64_t r, c; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + THTensor_(resizeAs)(r_, t); + + t_size_0 = THTensor_(size)(t, 0); + t_size_1 = THTensor_(size)(t, 1); + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data = THTensor_(data)(r_); + t_data = THTensor_(data)(t); + + for(r = 0; r < t_size_0; r++) + { + int64_t sz = THMin(r+k, t_size_1); + for(c = THMax(0, r+k); c < t_size_1; c++) + r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; + for(c = 0; c < sz; c++) + r__data[r*r__stride_0+c*r__stride_1] = 0; + } +} + +void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension) +{ + THTensor* inputs[2]; + inputs[0] = ta; + inputs[1] = tb; + THTensor_(catArray)(r_, inputs, 2, dimension); +} + +void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension); +inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension) +{ + int first_dims = first->dim(); + int second_dims = second->dim(); + THArgCheck(first_dims == second_dims, 0, + "Tensors must have same number of 
dimensions: got %d and %d", + first_dims, second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = first->size[dim]; + int64_t second_dim_size = second->size[dim]; + THArgCheck(first_dim_size == second_dim_size, 0, + "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", + dimension, (long long)first_dim_size, (long long)second_dim_size, dim); + } +} + +void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension) +{ + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). + // FIXME: warn if this is the case + bool allSkipped= true; + int64_t nDims = 0; + THTensor *notSkippedTensor; // non-owning reference + auto should_skip = [](THTensor *t) { return t->is_empty() && t->dim() == 1; }; + for (int i = 0; i < numInputs; i++) { + if (should_skip(inputs[i])) { + continue; + } + // We've found a non-empty tensor + allSkipped = false; + notSkippedTensor = inputs[i]; + nDims = notSkippedTensor->dim(); + break; + } + if (allSkipped) { + return; + } + + // Compute cat_dimension based on the non-empty tensor + THArgCheck(dimension < nDims, 4, "invalid dimension %d", dimension); + THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + for (int i = 0; i < numInputs; i++) { + THTensor *tensor = inputs[i]; + if (should_skip(tensor)) { + continue; + } + THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension); + cat_dim_size += tensor->size[dimension]; + } + + // Compute the size of the result + THLongStorage *size = THLongStorage_newWithSize(nDims); + for (int dim = 0; dim < nDims; dim++) { + int64_t result_dim_size = notSkippedTensor->size[dim]; + if (dim == dimension) { + result_dim_size = cat_dim_size; + } + THLongStorage_data(size)[dim] = result_dim_size; + } + THTensor_(resize)(result, size, NULL); + + // Check contiguity of all inputs and result + bool allContiguous = true; + for (int i = 0; i < numInputs; i++) { + if(!should_skip(inputs[i])) { + allContiguous = allContiguous && THTensor_(isContiguous)(inputs[i]); + } + } + allContiguous = allContiguous && THTensor_(isContiguous)(result); + + // First path is for contiguous inputs along dim 0 + // Second path for non-contiguous + int64_t offset; + if (dimension == 0 && allContiguous) { + real* result_data = THStorage_(data)(result->storage) + result->storageOffset; + offset = 0; + for (int j = 0; j < numInputs; j++) { + if (!should_skip(inputs[j])) { + THTensor* input0 = inputs[j]; + real* input0_data = THStorage_(data)(input0->storage) + input0->storageOffset; + int64_t input0_size = THTensor_(nElement)(input0); + // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this. 
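+        // To make the offset arithmetic concrete (sizes are hypothetical): for a
+        // dim-0 cat of contiguous 2x3 and 4x3 inputs, the 6x3 result's storage is
+        // just the 6 elements of the first input followed by the 12 elements of
+        // the second, so one memcpy per input at the running offset is enough;
+        // the non-contiguous / dim != 0 case below falls back to narrow + copy.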
+ if (input0_size != 0) { + memcpy(result_data + offset, input0_data, input0_size*sizeof(real)); + } + offset += input0_size; + } + } + } else { + offset = 0; + for (int j = 0; j < numInputs; j++) { + if (!should_skip(inputs[j])) { + int64_t dimSize = inputs[j]->size[dimension]; + THTensor *nt = THTensor_(newWithTensor)(result); + THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); + THTensor_(copy)(nt, inputs[j]); + THTensor_(free)(nt); + offset += dimSize; + } + } + } + THLongStorage_free(size); +} + +int THTensor_(equal)(THTensor *ta, THTensor* tb) +{ + int equal = 1; + if(!THTensor_(isSameSizeAs)(ta, tb)) + return 0; + + if (THTensor_(isContiguous)(ta) && THTensor_(isContiguous)(tb)) { + real *tap = THTensor_(data)(ta); + real *tbp = THTensor_(data)(tb); + ptrdiff_t sz = THTensor_(nElement)(ta); + ptrdiff_t i; + for (i=0; idim(), t->size, NULL); \ + TH_TENSOR_APPLY2(unsigned char, r_, real, t, \ + *r__data = (*t_data OP value) ? 1 : 0;); \ + } \ + void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \ + { \ + THTensor_(resizeNd)(r_, t->dim(), t->size, NULL); \ + TH_TENSOR_APPLY2(real, r_, real, t, \ + *r__data = (*t_data OP value) ? 1 : 0;); \ + } \ + void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \ + { \ + THByteTensor_resizeNd(r_, ta->dim(), ta->size, NULL); \ + TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \ + *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ + } \ + void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \ + { \ + THTensor_(resizeNd)(r_, ta->dim(), ta->size, NULL); \ + TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \ + *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ + } \ + + +TENSOR_IMPLEMENT_LOGICAL(lt,<) +TENSOR_IMPLEMENT_LOGICAL(gt,>) +TENSOR_IMPLEMENT_LOGICAL(le,<=) +TENSOR_IMPLEMENT_LOGICAL(ge,>=) +TENSOR_IMPLEMENT_LOGICAL(eq,==) +TENSOR_IMPLEMENT_LOGICAL(ne,!=) + + +#ifdef _OPENMP + +#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + ptrdiff_t r_Size = THTensor_(nElement)(r_); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + int inOMP = omp_in_parallel(); \ + if( !inOMP ){ \ + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \ + } else { \ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = CFUNC(*t_data);); \ + } \ + } + +#define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \ + LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + ptrdiff_t r_Size = THTensor_(nElement)(r_); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + if (r_Contig && tContig) { \ + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(NAME)(r__data, t_data, r__len);); \ + } else { \ + int inOMP = omp_in_parallel(); \ + if( !inOMP ){ \ + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \ + } \ + else { \ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = CFUNC(*t_data);); \ + } \ + } \ + } + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \ + LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD) + +#else + +#define 
LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \ + } \ + +#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \ + LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + if (r_Contig && tContig) { \ + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(NAME)(r__data, t_data, r__len);); \ + } else { \ + TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \ + } \ + } \ + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \ + LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) + +#endif + +#define EXPAND(...) __VA_ARGS__ + +#define GET_4TH_ARG(ARG0, ARG1, ARG2, ARG3, ...) ARG3 + +#define LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(...) \ + EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, )) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(...) \ + EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS, LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS, )) + +#define LAB_IMPLEMENT_BASIC_FUNCTION(...) EXPAND(LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__)) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION(...) EXPAND(LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__)) + +/* + * LAB_IMPLEMENT_BASIC_FUNCTION is a macro with optional parameters, you can use it flexibly. + * The macro will discard the invalid openmp threshold if openmp is unavailable. The macro will give a default threshold even if you forget to pass one. + * In other word, + * (A), If openmp is UNavailable, the two usage below is both right. + * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) // discard the invalid openmp threshold + * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) + * (B), If openmp is available, the two usage below is also both right. + * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) + * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) // pass the default openmp threshold + * So do LAB_IMPLEMENT_VECTORIZED_FUNCTION. 
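+ *
+ * Roughly, the dispatch works like this: LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE appends the
+ * _3_ARGS and _2_ARGS variants to the argument list, and GET_4TH_ARG selects whichever
+ * variant lands in the fourth slot. Illustrative expansion only:
+ *   LAB_IMPLEMENT_BASIC_FUNCTION(log, TH_MATH_NAME(log))                                  -> _2_ARGS variant
+ *   LAB_IMPLEMENT_BASIC_FUNCTION(exp, TH_MATH_NAME(exp), HYPER_TH_OMP_OVERHEAD_THRESHOLD) -> _3_ARGS variant
+ * Either way the result is a definition of the form void THTensor_(NAME)(THTensor *r_, THTensor *t).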
+*/ + +LAB_IMPLEMENT_BASIC_FUNCTION(neg,-) + +#if defined(TH_REAL_IS_LONG) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs) +#endif /* int64_t only part */ + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs) +#endif /* int only part */ + +#if defined(TH_REAL_IS_BYTE) + +int THTensor_(logicalAndAll)(THTensor *tensor) +{ + real prod = 1; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, &&:prod, prod = prod && *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, prod = prod && *tensor_data;); + } + return prod; +} + +int THTensor_(logicalAnyAll)(THTensor *tensor) +{ + real sum = 0; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, ||:sum, sum = sum || *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, sum = sum || *tensor_data;); + } + return (bool)sum; +} + +void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 1; + for(j=0; j < t->size[dimension]; ++j) { + *r__data = *r__data && *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + + if(serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal prod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + prod = prod && t_data[i*t_stride]; + *r__data = (real)prod;); + } else { + THTensor_(fill)(r_, 1); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data && *t_data;); + THTensor_(free)(temp_); + } + } + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + 
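+  // Reduce with logical OR along `dimension`: each output element is the OR of the
+  // corresponding input slice (the counterpart of logicalAnd above, which reduces with &&).
+  // The OpenMP path below walks the contiguous output linearly and reconstructs the
+  // matching base offset into t from the output strides.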
THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 0; + for(j=0; j < t->size[dimension]; ++j) { + *r__data = *r__data || *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + if (serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal sum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + sum = sum || t_data[i*t_stride]; + *r__data = (real)sum;); + } else { + THTensor_(zero)(r_); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data || *t_data;); + THTensor_(free)(temp_); + } + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +#endif /* Byte only part */ + +/* floating point only now */ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +#if defined (TH_REAL_IS_FLOAT) +#define TH_MATH_NAME(fn) fn##f +#else +#define TH_MATH_NAME(fn) fn +#endif + +LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log)) +LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(digamma,TH_MATH_NAME(TH_digamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(trigamma,TH_MATH_NAME(TH_trigamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(log10,TH_MATH_NAME(log10)) +LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p)) +LAB_IMPLEMENT_BASIC_FUNCTION(log2,TH_MATH_NAME(log2)) +LAB_IMPLEMENT_BASIC_FUNCTION(erf,TH_MATH_NAME(erf)) +LAB_IMPLEMENT_BASIC_FUNCTION(erfc,TH_MATH_NAME(erfc)) +LAB_IMPLEMENT_BASIC_FUNCTION(erfinv,TH_erfinv) +LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil)) +LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor)) +LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round)) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs)) +LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc)) +LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac)) +LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / ) + +LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(expm1,TH_MATH_NAME(expm1),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) 
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD) + +LAB_IMPLEMENT_VECTORIZED_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid),HYPER_TH_OMP_OVERHEAD_THRESHOLD) + +void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty) +{ + THTensor_(resizeAs)(r_, tx); + TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data);); +} + +void THTensor_(polygamma)(THTensor *r_, int64_t n, THTensor *t) { + switch (n) { + case 0: THTensor_(digamma)(r_, t); return; + case 1: THTensor_(trigamma)(r_, t); return; + default: THError("polygamma(n,x) is not implemented for n>=2"); + } +} + +void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight) +{ + THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match"); + THTensor_(resizeAs)(r_, a); + TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight);); +} + +void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(sum)(r_, t, dimension, keepdim); + THTensor_(div)(r_, r_, t->size[dimension]); +} + +void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + // Uses Welford's algorithm for numeric stability + accreal mean = 0; + accreal M2 = 0; + + int64_t i; + for (i = 0; i < t_size; i++) + { + real z = t_data[i*t_stride]; + real delta = z - mean; + mean += delta / (i + 1); + real delta2 = z - mean; + M2 += delta * delta2; + } + + if (biased && t_size >= 2) + { + *r__data = TH_MATH_NAME(sqrt)(M2 / t_size); + } else if (!biased && t_size >= 2) { + *r__data = TH_MATH_NAME(sqrt)(M2 / (t_size - 1)); + } else if (biased && t_size == 1) { + *r__data = 0; + } else { + *r__data = NAN; + }); + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + // Uses 
Welford's algorithm for numeric stability + accreal mean = 0; + accreal M2 = 0; + + int64_t i; + for (i = 0; i < t_size; i++) + { + real z = t_data[i*t_stride]; + real delta = z - mean; + mean += delta / (i + 1); + real delta2 = z - mean; + M2 += delta * delta2; + } + + if (biased && t_size >= 2) + { + *r__data = M2 / t_size; + } else if (!biased && t_size >= 2) { + *r__data = M2 / (t_size - 1); + } else if (biased && t_size == 1) { + *r__data = 0; + } else { + *r__data = NAN; + }); + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + #define DIM_REDUCE(reduce, transform) \ + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, \ + accreal sum = 0; \ + int64_t i; \ + for(i = 0; i < t_size; i++) { \ + (reduce); \ + } \ + (transform);) \ + + if(value == 0) { + DIM_REDUCE(sum += t_data[i*t_stride] != 0.0, + *r__data = sum); + } else if (value == 1) { + DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride]), + *r__data = sum); + } else if (value == 2) { + DIM_REDUCE(sum += t_data[i*t_stride] * t_data[i*t_stride], + *r__data = TH_MATH_NAME(sqrt)(sum)); + } else if (value == 3) { + DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride] * t_data[i*t_stride] * t_data[i*t_stride]), + *r__data = TH_MATH_NAME(pow)(sum, 1.0/3)); + } else if (value == INFINITY) { + DIM_REDUCE(sum = THMax(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])), + *r__data = sum); + } else { + DIM_REDUCE(sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(t_data[i*t_stride]), value), + *r__data = TH_MATH_NAME(pow)(sum, 1.0/value)); + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } + #undef DIM_REDUCE +} + +accreal THTensor_(normall)(THTensor *tensor, real value) +{ + accreal sum = 0; + if(value == 0) { + TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;); + return sum; + } else if(value == 1) { + TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data);); + return sum; + } else if(value == 2) { + TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;); + return sqrt(sum); + } else if(value == 3) { + TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += std::abs(z*z*z);); + return TH_MATH_NAME(pow)(sum, 1.0/3); + } else if(value == INFINITY) { + TH_TENSOR_APPLY(real, tensor, sum = THMax(sum, TH_MATH_NAME(fabs)(*tensor_data));); + return sum; + } else { + TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value);); + return TH_MATH_NAME(pow)(sum, 1.0/value); + } +} + +void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm) +{ + int i; + THTensor *rowR, *rowS; + + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + THArgCheck(value > 0, 2, "non-positive-norm not supported"); + THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", + THTensor_(nDimension)(src)); + + rowR = THTensor_(new)(); + rowS = THTensor_(new)(); + + THTensor_(resizeAs)(res, src); + + for (i=0; isize[dimension]; i++) + { + real norm = 
0; + real new_norm; + + THTensor_(select)(rowS, src, dimension, i); + THTensor_(select)(rowR, res, dimension, i); + if (value == 1) { + TH_TENSOR_APPLY(real, rowS, norm += fabs(*rowS_data);); + } else if (value == 2) { + TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;); + } else if (value == INFINITY) { + TH_TENSOR_APPLY(real, rowS, norm = THMax(norm, TH_MATH_NAME(fabs)(*rowS_data));); + } else { + TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value);); + } + + if (value != INFINITY) { + norm = pow(norm, 1/value); + } + + if (norm > maxnorm) + { + new_norm = maxnorm / (norm + 1e-7); + + TH_TENSOR_APPLY2( + real, rowR, real, rowS, + *rowR_data = (*rowS_data) * new_norm; + ) + } + else + THTensor_(copy)(rowR, rowS); + } + + THTensor_(free)(rowR); + THTensor_(free)(rowS); +} + +accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value) +{ + real sum = 0; + TH_TENSOR_APPLY2(real, tensor, real, src, + sum += TH_MATH_NAME(pow)( + TH_MATH_NAME(fabs)(*tensor_data - *src_data), value);); + return TH_MATH_NAME(pow)(sum, 1.0/value); +} + +accreal THTensor_(meanall)(THTensor *tensor) +{ + return THTensor_(sumall)(tensor)/THTensor_(nElement)(tensor); +} + +accreal THTensor_(varall)(THTensor *tensor, int biased) +{ + accreal mean = THTensor_(meanall)(tensor); + accreal sum = 0; + TH_TENSOR_APPLY(real, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean);); + sum /= std::max(0, THTensor_(nElement)(tensor) - (biased ? 0 : 1)); + return sum; +} + +accreal THTensor_(stdall)(THTensor *tensor, int biased) +{ + return sqrt(THTensor_(varall)(tensor, biased)); +} + +void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n) +{ + real i = 0; + + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + + if (THTensor_(nElement)(r_) != n) { + THTensor_(resize1d)(r_, n); + } + + if (n == 0) { + } else if (n == 1) { + THTensor_(set1d)(r_, 0, a); + } else { + TH_TENSOR_APPLY(real, r_, + *r__data = a + (b-a)/((real)(n-1))*i; + i++; + ); + } +} + +void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n) +{ + real i = 0; + + // NumPy allows you to pass different points even if n <= 1 -- should we? 
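+  // Illustrative example: logspace(r_, 0, 2, 3) fills r_ with {1, 10, 100}, i.e. n points
+  // whose exponents are evenly spaced between a and b.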
+ THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + + if (THTensor_(nElement)(r_) != n) { + THTensor_(resize1d)(r_, n); + } + + if (n == 0) { + } else if (n == 1) { + THTensor_(set1d)(r_, 0, TH_MATH_NAME(pow)(10.0, a)); + } else { + TH_TENSOR_APPLY(real, r_, + *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1))); + i++; + ); + } +} + +void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) +{ + real minval; + real maxval; + real *h_data; + + THTensor_(resize1d)(hist, nbins); + THTensor_(zero)(hist); + minval = minvalue; + maxval = maxvalue; + if (minval == maxval) + { + minval = THTensor_(minall)(tensor); + maxval = THTensor_(maxall)(tensor); + } + if (minval == maxval) + { + minval = minval - 1; + maxval = maxval + 1; + } + + h_data = THTensor_(data)(hist); + + TH_TENSOR_APPLY(real, tensor, + if (*tensor_data >= minval && *tensor_data <= maxval) { + const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins); + h_data[THMin(bin, nbins-1)] += 1; + } + ); +} + +void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) +{ + THArgCheck(THTensor_(_nDimension)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(_nDimension)(tensor)); + + int dimension = 1; + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(tensor), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + real minval; + real maxval; + + THTensor_(resize2d)(hist, tensor->size[0], nbins); + THTensor_(zero)(hist); + + minval = minvalue; + maxval = maxvalue; + if (minval == maxval) + { + minval = THTensor_(minall)(tensor); + maxval = THTensor_(maxall)(tensor); + } + if (minval == maxval) + { + minval = minval - 1; + maxval = maxval + 1; + } + + TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, int64_t i; + for(i = 0; i < tensor_size; i++) + { + if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) { + const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins); + hist_data[THMin(bin, nbins-1)] += 1; + } + } + ); +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes x is close to zero and uses a Taylor expansion. +static inline real THTensor_(beta_grad_alpha_small)(real x, real alpha, real beta) { + const real factor = TH_MATH_NAME(TH_digamma)(alpha) - TH_MATH_NAME(TH_digamma)(alpha + beta) - TH_MATH_NAME(log)(x); + real numer = 1; + real series = numer / alpha * (factor + 1 / alpha); + for (int i = 1; i <= 10; ++i) { + numer *= (i - beta) * x / i; + const real denom = alpha + i; + series += numer / denom * (factor + 1 / denom); + } + const real result = x * TH_MATH_NAME(pow)(1 - x, -beta) * series; + return th_isnan(result) ? 0.0 : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta. +// Assumes x is close to zero and uses a Taylor expansion. +static inline real THTensor_(beta_grad_beta_small)(real x, real alpha, real beta) { + const real factor = TH_MATH_NAME(TH_digamma)(alpha+beta) - TH_MATH_NAME(TH_digamma)(beta); + real numer = 1; + real betas = 1; + real dbetas = 0; + real series = factor / alpha; + for (int i = 1; i <= 8; ++i) { + numer *= -x / i; + dbetas = dbetas * (beta - i) + betas; + betas = betas * (beta - i); + series += numer / (alpha + i) * (dbetas + factor * betas); + } + const real result = -TH_MATH_NAME(pow)(1 - x, 1 - beta) * series; + return th_isnan(result) ? 
0.0 : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes alpha and beta are both large and uses a Rice saddle point expansion. +// To ensure numerical stability, this computation is performed at higher precision. +static inline real THTensor_(beta_grad_alpha_mid)(double x, double alpha, double beta) { + const double total = alpha + beta; + const double mean = alpha / total; + const double std = sqrt(alpha * beta / (total + 1)) / total; + if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) { + // Avoid the singularity at x = mean. + const double poly = 47 * x * (beta*beta)*(beta*beta) + alpha * ( + (43 + 20 * (16 + 27 * beta) * x) * (beta*beta)*beta + alpha * ( + 3 * (59 + 180 * beta - 90 * x) * (beta*beta) + alpha * ( + (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * ( + 8 * (1 - x) * (135 * beta - 11))))); + const double prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total); + const double prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total); + return prefactor_num / (1 - x) * poly / prefactor_den; + } + const double prefactor = -x / sqrt(2 * alpha * beta / total); + const double stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha*alpha)) + * (1 + 1 / (12 * beta) + 1 / (288 * beta*beta)) + / (1 + 1 / (12 * total) + 1 / (288 * total*total)); + const double term1_num = 2 * (alpha*alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta*beta); + const double axbx = alpha * (x-1) + beta * x; + const double term1_den = sqrt(2 * alpha / beta) * pow(total, 1.5f) * axbx*axbx; + const double term1 = term1_num / term1_den; + const double term2 = 0.5f * log(alpha / (total * x)); + const double term3_num = sqrt(8 * alpha * beta / total); + const double term3_den = beta * x + alpha * (x - 1); + const double term3 = term3_num / term3_den; + const double term4_base = beta * log(beta / (total * (1 - x))) + + alpha * log(alpha / (total * x)); + const double term4 = pow(term4_base, -1.5f); + const double term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4)); + return stirling * prefactor * term1234; +} + +// Computes a scaled reparameterized gradient +// -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x) +// for random number x drawn from a Beta distribution Beta(alpha,beta). +// This function inputs total=alpha+beta to make it easy to implement +// Dirichlet reparameterized gradients in terms of Betas. +static inline real THTensor_(dirichlet_grad_one)(real x, real alpha, real total) { + const real beta = total - alpha; + const real boundary = total * x * (1 - x); + + // Use an asymptotic approximation for x close to 0. + if (x <= 0.5f && boundary < 2.5f) { + return THTensor_(beta_grad_alpha_small)(x, alpha, beta); + } + + // Use an asymptotic approximation for x close to 1. + if (x >= 0.5f && boundary < 0.75f) { + return -THTensor_(beta_grad_beta_small)(1 - x, beta, alpha); + } + + // Use an asymptotic approximation when alpha and (total - alpha) are both large. + if (alpha > 6 && beta > 6) { + return THTensor_(beta_grad_alpha_mid)(x, alpha, beta); + } + + // Use a rational correction to an analytic approximation. 
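+  // The table c below holds the coefficients of two polynomials p and q (cubic in b,
+  // quadratic in u and a) evaluated at u = log(x), a = log(alpha) - u, b = log(total) - a;
+  // the result is (p / q) times the analytic approximation
+  // x * (digamma(total) - digamma(alpha)) / beta computed at the end of this function.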
+ static const real c[2][3][3][4] = { + {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863}, + {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033}, + {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}}, + {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814}, + {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057}, + {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}}, + {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565}, + {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181}, + {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}}, + {{{1, -0.02924021934, -0.04438342661, 0.007285809825}, + {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521}, + {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}}, + {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273}, + {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956}, + {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}}, + {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05}, + {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05}, + {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}}, + }; + const real u = TH_MATH_NAME(log)(x); + const real a = TH_MATH_NAME(log)(alpha) - u; + const real b = TH_MATH_NAME(log)(total) - a; + const real pow_u[3] = {1, u, u * u}; + const real pow_a[3] = {1, a, a * a}; + real p = 0.0; + real q = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + const real ua = pow_u[i] * pow_a[j]; + p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3]))); + q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3]))); + } + } + const real approx = x * (TH_MATH_NAME(TH_digamma)(total) - TH_MATH_NAME(TH_digamma)(alpha)) / beta; + return p / q * approx; +} + +void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total) +{ + x = THTensor_(newContiguous)(x); + alpha = THTensor_(newContiguous)(alpha); + total = THTensor_(newContiguous)(total); + TH_CHECK_SAME_SIZE(alpha, x); + TH_CHECK_SAME_SIZE(total, x); + THTensor_(resizeAs)(self, x); + THTensor* grad = THTensor_(newContiguous)(self); + + real*const grad_data = THTensor_(data)(grad); + real*const x_data = THTensor_(data)(x); + real*const alpha_data = THTensor_(data)(alpha); + real*const total_data = THTensor_(data)(total); + const int64_t numel = THTensor_(nElement)(x); + int64_t i; + #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for(i = 0; i < numel; ++i) { + grad_data[i] = THTensor_(dirichlet_grad_one)(x_data[i], alpha_data[i], total_data[i]); + } + + THTensor_(freeCopyTo)(grad, self); +} + + +#undef TH_MATH_NAME +#endif /* floating point only part */ +#undef IS_NONZERO +#endif diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h new file mode 100644 index 0000000..08f3f15 --- /dev/null +++ b/aten/src/TH/generic/THTensorMath.h @@ -0,0 +1,214 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorMath.h" +#else + +TH_API void THTensor_(fill)(THTensor *r_, real value); +TH_API void THTensor_(zero)(THTensor *r_); + +TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value); +TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src); +TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); + +TH_API 
void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor); + +TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); +TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val); +TH_API void THTensor_(take)(THTensor *tensor, THTensor *src, THLongTensor *index); +TH_API void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate); + +TH_API void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); +TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val); + +TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src); + +TH_API real THTensor_(minall)(THTensor *t); +TH_API real THTensor_(maxall)(THTensor *t); +TH_API real THTensor_(medianall)(THTensor *t); +TH_API accreal THTensor_(sumall)(THTensor *t); +TH_API accreal THTensor_(prodall)(THTensor *t); + +TH_API void THTensor_(neg)(THTensor *self, THTensor *src); +TH_API void THTensor_(cinv)(THTensor *self, THTensor *src); + +TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(sub)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(add_scaled)(THTensor *r_, THTensor *t, real value, real alpha); +TH_API void THTensor_(sub_scaled)(THTensor *r_, THTensor *t, real value, real alpha); +TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(lshift)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value); +TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value); + +TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src); +TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2); +TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); + +TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2); +TH_API void THTensor_(addcdiv)(THTensor *r_, 
THTensor *t, real value, THTensor *src1, THTensor *src2); + +TH_API void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec); +TH_API void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat1, THTensor *mat2); +TH_API void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2); + +TH_API void THTensor_(addbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); +TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); + +TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain); + +TH_API ptrdiff_t THTensor_(numel)(THTensor *t); +void THTensor_(preserveReduceDimSemantics)(THTensor *r_, int in_dims, int reduce_dimension, int keepdim); +TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim); +TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension); +TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension); +TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); +TH_API accreal THTensor_(trace)(THTensor *t); +TH_API void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension); + +TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src); +TH_API void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src); +TH_API void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value); +TH_API void THTensor_(cminValue)(THTensor *r, THTensor *t, real value); + +TH_API void THTensor_(zerosLike)(THTensor *r_, THTensor *input); +TH_API void THTensor_(onesLike)(THTensor *r_, THTensor *input); +TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k); +TH_API void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m); +TH_API void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step); +TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step); +TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n); + +TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder); +TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted); +TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k); +TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k); +TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension); +TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension); + +TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb); + +TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, real value); 
+TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(geValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(neValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(eqValue)(THByteTensor *r_, THTensor* t, real value); + +TH_API void THTensor_(ltValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(leValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(gtValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(geValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(neValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(eqValueT)(THTensor *r_, THTensor* t, real value); + +TH_API void THTensor_(ltTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(leTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(gtTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(geTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(neTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(eqTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); + +TH_API void THTensor_(ltTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(leTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(gtTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(geTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(neTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); + +TH_API void THTensor_(pow)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(tpow)(THTensor *r_, real value, THTensor *t); + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) +TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); +#endif + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log)(THTensor *r_, THTensor *t); +TH_API void THTensor_(lgamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(digamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(trigamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(polygamma)(THTensor *r_, int64_t n, THTensor *t); +TH_API void THTensor_(log10)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log2)(THTensor *r_, THTensor *t); +TH_API void THTensor_(exp)(THTensor *r_, THTensor *t); +TH_API void THTensor_(expm1)(THTensor *r_, THTensor *t); +TH_API void THTensor_(cos)(THTensor *r_, THTensor *t); +TH_API void THTensor_(acos)(THTensor *r_, THTensor *t); +TH_API void THTensor_(cosh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sin)(THTensor *r_, THTensor *t); +TH_API void THTensor_(asin)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sinh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(tan)(THTensor *r_, THTensor *t); +TH_API void THTensor_(atan)(THTensor *r_, THTensor *t); +TH_API void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty); +TH_API void THTensor_(tanh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erf)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erfc)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erfinv)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sqrt)(THTensor *r_, THTensor *t); +TH_API void THTensor_(rsqrt)(THTensor 
*r_, THTensor *t); +TH_API void THTensor_(ceil)(THTensor *r_, THTensor *t); +TH_API void THTensor_(floor)(THTensor *r_, THTensor *t); +TH_API void THTensor_(round)(THTensor *r_, THTensor *t); +TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); +TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t); +TH_API void THTensor_(frac)(THTensor *r_, THTensor *t); +TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight); + +TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim); +TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim); +TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim); +TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm); +TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value); +TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); +TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); + +TH_API accreal THTensor_(meanall)(THTensor *self); +TH_API accreal THTensor_(varall)(THTensor *self, int biased); +TH_API accreal THTensor_(stdall)(THTensor *self, int biased); +TH_API accreal THTensor_(normall)(THTensor *t, real value); + +TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n); +TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n); + +TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total); +#endif + +#if defined(TH_REAL_IS_BYTE) + +TH_API int THTensor_(logicalAndAll)(THTensor *self); +TH_API int THTensor_(logicalAnyAll)(THTensor *self); +TH_API void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim); + +#endif /* TH_REAL_IS_BYTE */ + +#endif diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp new file mode 100644 index 0000000..3ddbfa6 --- /dev/null +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -0,0 +1,552 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorRandom.cpp" +#else + +#include + +#ifdef _OPENMP +#include +#endif + +#include + +#include "THGenerator.hpp" + +void THTensor_(random)(THTensor *self, THGenerator *_generator) +{ + std::lock_guard lock(_generator->mutex); +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY(real, self, *self_data = (uint8_t)(THRandom_random(_generator) % (UINT8_MAX + 1));); +#elif defined(TH_REAL_IS_CHAR) + TH_TENSOR_APPLY(real, self, *self_data = (int8_t)(THRandom_random(_generator) % (INT8_MAX + 1));); +#elif defined(TH_REAL_IS_SHORT) + TH_TENSOR_APPLY(real, self, *self_data = (int16_t)(THRandom_random(_generator) % (INT16_MAX + 1));); +#elif defined(TH_REAL_IS_INT) + TH_TENSOR_APPLY(real, self, *self_data = (int32_t)(THRandom_random(_generator) % (INT32_MAX + 1UL));); +#elif defined(TH_REAL_IS_LONG) + TH_TENSOR_APPLY(real, self, *self_data = (uint64_t)(THRandom_random64(_generator) % (LONG_MAX + 1ULL));); +#elif defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY(real, self, *self_data = (float)(THRandom_random(_generator) % ((1ULL << FLT_MANT_DIG) + 1));); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY(real, self, *self_data = (double)(THRandom_random64(_generator) % ((1ULL << 
DBL_MANT_DIG) + 1));); +#else +#error "Unknown type" +#endif + +} + +void THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max) { + std::lock_guard lock(_generator->mutex); + THArgCheck(max > min, 2, "max must be greater than min, but got: min = %lld, max = %lld", min, max); + uint64_t range = max - min; +#if defined(TH_REAL_IS_LONG) || defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + if (range >= 1ULL << 32) { + TH_TENSOR_APPLY(real, self, *self_data = static_cast(static_cast((THRandom_random64(_generator) % range) + min));) + return; + } +#endif + TH_TENSOR_APPLY(real, self, *self_data = static_cast(static_cast((THRandom_random(_generator) % range) + min));) +} + +void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max) { + THArgCheck(max > 0, 1, "max must be positive, but got: max = %lld", max); + THTensor_(clampedRandom)(self, _generator, 0, max); +} + +void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_geometric(_generator, p);); +} + +#ifdef TH_BLAS_MKL +#define BERNOULLI_OMP 800 +#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 + +void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator, const double p) +{ + int64_t seed = THRandom_random(_generator); + int64_t n = THTensor_(nElement)(self); + int contig = THTensor_(isContiguous)(self); + int *tmp = NULL; + THIntTensor* intTensor = NULL; + + if (contig) { +#ifdef TH_REAL_IS_INT + tmp = THIntTensor_data(self); +#else + tmp = (int*)THAlloc(n*sizeof(int)); +#endif + } else { + intTensor = THIntTensor_new(); + THIntTensor_resizeNd(intTensor, self->dim(), self->size, NULL); + tmp = THIntTensor_data(intTensor); + } + +#ifdef _OPENMP + size_t nthr = !omp_in_parallel() && n >= BERNOULLI_OMP ? omp_get_num_threads() : 1; +#pragma omp parallel num_threads(nthr) firstprivate(nthr) + { + size_t tid = omp_get_thread_num(); + int64_t seg_len_tmp = n / nthr; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == nthr - 1)? 
(n-line_index_offset) : seg_len_tmp; +#else + { + int64_t line_index_offset = 0; + int64_t line_seg_len = n; +#endif + + if (line_seg_len > 0) { + VSLStreamStatePtr stream; + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, line_index_offset); + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, line_seg_len, + tmp + line_index_offset, p); + vslDeleteStream(&stream); + +#ifndef TH_REAL_IS_INT + if (contig) { + real* self_seg = THTensor_(data)(self) + line_index_offset; + int* tmp_seg = tmp + line_index_offset; + THVector_(cvtFromInt)(self_seg, tmp_seg, line_seg_len); + } +#endif + } + } + + if(contig) { +#ifndef TH_REAL_IS_INT + THFree(tmp); +#endif + } else { +#ifdef _OPENMP + TH_TENSOR_APPLY2_OMP(n, 1, 0, int, intTensor, real, self, *self_data = *intTensor_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY) +#else + TH_TENSOR_APPLY2(int, intTensor, real, self, *self_data = *intTensor_data;) +#endif + THIntTensor_free(intTensor); + } + +} + +#endif + +void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) +{ +#ifdef TH_BLAS_MKL + if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { + std::lock_guard lock(_generator->mutex); + THTensor_(iBernoulli_generate_copy)(self, _generator, p); + } else { + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); + } +#else + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); +#endif +} + +void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY2(real, self, float, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); +} + +void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY2(real, self, double, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); +} + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +#if defined(TH_REAL_IS_FLOAT) +#define TH_REAL_MIN FLT_MIN +#elif defined(TH_REAL_IS_DOUBLE) +#define TH_REAL_MIN DBL_MIN +#endif + +void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor* p) +{ +#if defined(TH_REAL_IS_FLOAT) + THTensor_(bernoulli_FloatTensor)(self, _generator, p); +#else + THTensor_(bernoulli_DoubleTensor)(self, _generator, p); +#endif +} + +void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b) +{ + std::lock_guard lock(_generator->mutex); + #if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY(real, self, *self_data = + (real)THRandom_uniformFloat(_generator, (real)a, (real)b);); + #else + TH_TENSOR_APPLY(real, self, *self_data = + (real)THRandom_uniform(_generator, a, b);); + #endif +} + +void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stddev) +{ + std::lock_guard lock(_generator->mutex); + const int64_t size = THTensor_(numel)(self); + if (size >= 16 && THTensor_(isContiguous)(self)) { + THVector_(normal_fill)(THStorage_(data)(self->storage), size, _generator, mean, stddev); + } else { + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stddev);); + } +} + +void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor *means, double stddev) +{ + THTensor_(resizeAs)(self, means); + THTensor_(normal)(self, gen, 0, stddev); + 
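+  // self currently holds zero-mean N(0, stddev) samples; adding means element-wise
+  // (self += 1 * means) yields samples distributed as N(means[i], stddev).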
THTensor_(cadd)(self, self, 1, means); +} + +void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs) +{ + THTensor_(resizeAs)(self, stddevs); + THTensor_(normal)(self, gen, 0, 1); + THTensor_(cmul)(self, self, stddevs); + THTensor_(add)(self, self, mean); +} + +void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs) +{ + THTensor_(resizeAs)(self, means); + THTensor_(normal)(self, gen, 0, 1); + THTensor_(cmul)(self, self, stddevs); + THTensor_(cadd)(self, self, 1, means); +} + +void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_exponential(_generator, lambda);); +} + +#undef TH_REAL_MIN + +void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_cauchy(_generator, median, sigma);); +} + +void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_logNormal(_generator, mean, stdv);); +} + +void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor *q) +{ + int64_t inputsize = THTensor_(nElement)(probs); + int64_t i = 0; + THLongTensor *smaller = THLongTensor_newWithSize1d(inputsize); + THLongTensor *larger = THLongTensor_newWithSize1d(inputsize); + int64_t small_c = 0; + int64_t large_c = 0; + THLongTensor_resize1d(J, inputsize); + THTensor_(resize1d)(q, inputsize); + real *q_data = THTensor_(data)(q); + int64_t *J_data = THLongTensor_data(J); + + for (i = 0; i < inputsize; i++) + { + THLongTensor_fastSet1d(J, i, 0L); + real val = THTensor_(fastGet1d)(probs, i); + THTensor_(fastSet1d)(q, i, inputsize*val); + + if (inputsize * val < 1.0) + { + THLongTensor_fastSet1d(smaller, small_c, i); + small_c += 1; + } + else + { + THLongTensor_fastSet1d(larger, large_c, i); + large_c += 1; + } + } + + // Loop through and create little binary mixtures that + // appropriately allocate the larger outcomes over the + // overall uniform mixture. + int64_t large, small; + while (small_c > 0 && large_c > 0) + { + large = THLongTensor_fastGet1d(larger, large_c-1); + small = THLongTensor_fastGet1d(smaller, small_c-1); + + THLongTensor_fastSet1d(J, small, large); + q_data[large * q->stride[0]] -= 1.0 - THTensor_(fastGet1d)(q, small); + + if(q_data[large * q->stride[0]] < 1.0) + { + THLongTensor_fastSet1d(smaller, small_c-1, large); + large_c -= 1; + } + else + { + THLongTensor_fastSet1d(larger, large_c-1, large); + small_c -= 1; + } + } + + real q_min = THTensor_(fastGet1d)(q, inputsize-1); + real q_max = q_min; + real q_temp; + for (i=0; i < inputsize; i++) + { + q_temp = THTensor_(fastGet1d)(q, i); + if (q_temp < q_min) + q_min = q_temp; + else if (q_temp > q_max) + q_max = q_temp; + } + THArgCheckWithCleanup((q_min > 0), + THCleanup(THLongTensor_free(smaller); THLongTensor_free(larger);), 2, + "q_min is less than 0"); + + if (q_max > 1) + { + for (i=0; i < inputsize; i++) + { + q_data[i*q->stride[0]] /= q_max; + } + } + for (i=0; i < inputsize; i++) + { + // sometimes an large index isn't added to J. + // fix it by making the probability 1 so that J isn't indexed. 
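+    // (With q[i] forced to 1, multinomialAliasDraw below keeps index i itself with
+    //  probability 1, so the never-assigned alias slot J[i] is not consulted.)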
+ if(J_data[i] <= 0) + q_data[i] = 1.0; + } + THLongTensor_free(smaller); + THLongTensor_free(larger); +} +void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q) +{ + std::lock_guard lock(_generator->mutex); + int64_t K = THLongTensor_nElement(J); + int64_t output_nelem = THLongTensor_nElement(self); + int64_t i = 0, _mask=0; + real _q; + int64_t rand_ind, sample_idx, J_sample; + + for (i=0; i < output_nelem; i++) + { + rand_ind = THRandom_uniform(_generator, 0, K); + + _q = THTensor_(fastGet1d)(q, rand_ind); + + _mask = THRandom_bernoulli(_generator, _q); + + J_sample = THLongTensor_fastGet1d(J, rand_ind); + + sample_idx = J_sample*(1 -_mask) + (rand_ind+1L) * _mask; + + THLongTensor_fastSet1d(self, i, sample_idx-1L); + } +} +void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement) +{ + std::lock_guard lock(_generator->mutex); + int64_t start_dim = THTensor_(_nDimension)(prob_dist); + int64_t n_dist; + int64_t n_categories; + THDoubleTensor* cum_dist; + int64_t i,j,k; + + if (start_dim == 1) + { + THTensor_(unsqueeze1d)(prob_dist, prob_dist, 0); + } + + n_dist = THTensor_(size)(prob_dist, 0); + n_categories = THTensor_(size)(prob_dist, 1); + + THArgCheckWithCleanup(n_sample > 0, + THCleanup(if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "cannot sample n_sample <= 0 samples"); + + if (!with_replacement) + { + THArgCheckWithCleanup((!with_replacement) && (n_sample <= n_categories), + THCleanup(if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "cannot sample n_sample > prob_dist.size(1) samples without replacement"); + } + + /* cumulative probability distribution vector */ + cum_dist = THDoubleTensor_newWithSize1d(n_categories); + + /* will contain multinomial samples (category indices to be returned) */ + THLongTensor_resize2d(self, n_dist , n_sample); + + for (i=0; istorage, \ + prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \ + ); + THArgCheckWithCleanup((val >= 0), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry < 0)"); + THArgCheckWithCleanup((std::isfinite(val)), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + sum += val; + THDoubleStorage_set( + cum_dist->storage, \ + cum_dist->storageOffset+j*cum_dist->stride[0], \ + sum \ + ); + } + THArgCheckWithCleanup((sum > 0), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (sum of probabilities <= 0)"); + /* normalize cumulative probability distribution so that last val is 1 + i.e. 
doesn't assume original prob_dist row sums to one */ + if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) ) + { + for (j=0; jstride[0]] /= sum; + } + } + + for (j=0; jstride[0]] = 1; + + while(right_pointer - left_pointer > 0) + { + mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; + cum_prob = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \ + ); + if (cum_prob < uniform_sample) + { + left_pointer = mid_pointer + 1; + } + else + { + right_pointer = mid_pointer; + } + } + sample_idx = left_pointer; + + /* store in result tensor (will be incremented for lua compat by wrapper) */ + THLongStorage_set( \ + self->storage, \ + self->storageOffset+i*self->stride[0]+j*self->stride[1], \ + sample_idx \ + ); + + /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */ + if (!with_replacement && j < n_sample - 1) + { + /* update cumulative distribution so that sample cannot be drawn again */ + double diff; + double new_val = 0; + double sum; + + if (sample_idx != 0) + { + new_val = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \ + ); + } + /* marginal cumulative mass (i.e. original probability) of sample */ + diff = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \ + ) - new_val; + /* new sum of marginals is not one anymore... */ + sum = 1.0 - diff; + for (k=0; kstorage, \ + cum_dist->storageOffset+k*cum_dist->stride[0] \ + ); + if (k >= sample_idx) + { + /* remove sampled probability mass from later cumulative probabilities */ + new_val -= diff; + } + /* make total marginals sum to one */ + new_val /= sum; + THDoubleStorage_set( \ + cum_dist->storage, \ + cum_dist->storageOffset+k*cum_dist->stride[0], \ + new_val \ + ); + } + } + } + } + + THDoubleTensor_free(cum_dist); + + if (start_dim == 1) + { + THLongTensor_resize1d(self, n_sample); + THTensor_(squeeze1d)(prob_dist, prob_dist, 0); + } +} +#endif + +#if defined(TH_REAL_IS_BYTE) +void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self) +{ + std::lock_guard lock(_generator->mutex); + static const size_t size = sizeof(THGeneratorState); + THGeneratorState *rng_state; + THTensor_(resize1d)(self, size); + THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); + THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); + rng_state = (THGeneratorState *)THTensor_(data)(self); + THGeneratorState_copy(rng_state, &_generator->gen_state); +} + +void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self) +{ + std::lock_guard lock(_generator->mutex); + static const size_t size = sizeof(THGeneratorState); + THGeneratorState *rng_state; + THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); + THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); + rng_state = (THGeneratorState *)THTensor_(data)(self); + THArgCheck(THGeneratorState_isValid(rng_state), 1, "Invalid RNG state"); + THGeneratorState_copy(&_generator->gen_state, rng_state); +} +#endif +#endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h new file mode 100644 index 0000000..dc6bdaf --- /dev/null +++ b/aten/src/TH/generic/THTensorRandom.h @@ -0,0 +1,33 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorRandom.h" +#else + +TH_API void THTensor_(random)(THTensor *self, THGenerator *_generator); +TH_API void 
THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max); +TH_API void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max); +TH_API void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p); +TH_API void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p); +TH_API void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p); +TH_API void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p); + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +TH_API void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor *p); +TH_API void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b); +TH_API void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv); +TH_API void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor *means, double stddev); +TH_API void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs); +TH_API void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs); +TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda); +TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma); +TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv); +TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement); +TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor *J, THTensor *q); +TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q); +#endif + +#if defined(TH_REAL_IS_BYTE) +TH_API void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self); +TH_API void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self); +#endif + +#endif diff --git a/aten/src/TH/generic/THVector.h b/aten/src/TH/generic/THVector.h new file mode 100644 index 0000000..1931700 --- /dev/null +++ b/aten/src/TH/generic/THVector.h @@ -0,0 +1,68 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THVector.h" +#else + +// Opaque C++ struct +struct THGenerator; + +TH_API void THVector_(fill)(real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n); +TH_API void THVector_(adds)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n); +TH_API void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n); +TH_API void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(copy)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(neg)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(normal_fill)(real *data, + const int64_t size, + struct THGenerator *generator, + const real mean, + const real stddev); +#ifndef TH_REAL_IS_INT +TH_API void THVector_(cvtFromInt)(real *y, const int *x, const ptrdiff_t n); +#endif + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) +TH_API void 
THVector_(abs)(real *y, const real *x, const ptrdiff_t n); +#endif + +/* floating point only now */ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +TH_API void THVector_(log)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(lgamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(digamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(trigamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log10)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log1p)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log2)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sigmoid)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(exp)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(expm1)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erf)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erfc)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erfinv)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cos)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(acos)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cosh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sin)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(asin)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sinh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(tan)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(atan)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(tanh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(pow)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(sqrt)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(rsqrt)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(ceil)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(floor)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(round)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(abs)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(trunc)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(frac)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cinv)(real *y, const real *x, const ptrdiff_t n); + +#endif /* floating point only part */ + +#endif diff --git a/aten/src/TH/generic/THVectorDefault.cpp b/aten/src/TH/generic/THVectorDefault.cpp new file mode 100644 index 0000000..a32701a --- /dev/null +++ b/aten/src/TH/generic/THVectorDefault.cpp @@ -0,0 +1,289 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THVectorDefault.cpp" +#else + +#include "../THRandom.h" + +void THVector_(copy_DEFAULT)(real *x, const real *y, const ptrdiff_t n) { + ptrdiff_t i = 0; + + for(; i (0, 1] for log. 
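+    // Box-Muller: u1 and u2 are independent uniforms; radius = sqrt(-2*ln(u1))
+    // and theta = 2*pi*u2 below turn them into two independent standard
+    // normal values, which are then scaled by stddev and shifted by mean.
+    // The pairs are interleaved 8 apart, so data[j] and data[j + 8] are
+    // transformed together.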
+ const real u2 = data[j + 8]; + + const real radius = sqrt(-2 * log(u1)); + const real theta = 2.0f * M_PI * u2; + + data[j] = radius * cos(theta) * stddev + mean; + data[j + 8] = radius * sin(theta) * stddev + mean; + } +} + +void THVector_(normal_fill_DEFAULT)(real *data, + int64_t size, + THGenerator *generator, + const real mean, + const real stddev) +{ + THAssert(size >= 16 && "Size must be >= 16 for normal fill"); + + for (int64_t i = 0; i < size; ++i) { +#ifdef TH_REAL_IS_FLOAT + data[i] = THRandom_uniformFloat(generator, 0, 1); +#else + data[i] = THRandom_uniform(generator, 0, 1); +#endif + } + + for (int64_t i = 0; i < size - 15; i += 16) { + THVector_(interleaved_normal_fill_16)(data + i, mean, stddev); + } + + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (int64_t i = 0; i < 16; ++i) { +#ifdef TH_REAL_IS_FLOAT + data[i] = THRandom_uniformFloat(generator, 0, 1); +#else + data[i] = THRandom_uniform(generator, 0, 1); +#endif + } + THVector_(interleaved_normal_fill_16)(data, mean, stddev); + } +} + +#define VECTOR_IMPLEMENT_FUNCTION(NAME, CFUNC) \ + void THVector_(NAME)(real *y, const real *x, const ptrdiff_t n) \ + { \ + ptrdiff_t i = 0; \ + for(; i + +static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) { + unsigned int cpui[4]; + __cpuid(cpui, __level); + *__eax = cpui[0]; *__ebx = cpui[1]; *__ecx = cpui[2]; *__edx = cpui[3]; + return 1; +} + +static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { + *eax = 0; *edx = 0; + if (op == 0) + *eax = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +} + +#else + +#if __i386__ +#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +__asm(" pushl %%ebx\n" \ +" cpuid\n" \ +" mov %%ebx,%1\n" \ +" popl %%ebx" \ +: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ +: "0"(__level)) +#else +#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ +: "0"(__level)) +#endif + +static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) { + __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} + +#endif + +enum ECPUFeature +{ + kCPUFeature_SSE = 0x01, + kCPUFeature_SSE2 = 0x02, + kCPUFeature_SSE3 = 0x04, + kCPUFeature_SSE3_S = 0x08, + kCPUFeature_SSE4_1 = 0x10, + kCPUFeature_SSE4_2 = 0x20, + kCPUFeature_AVX = 0x40 +}; + +static unsigned int checkCPUFeatures() { + unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; + unsigned int features = 0; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + if( (edx & (1 << 25)) != 0 ) { + features |= kCPUFeature_SSE; + } + if( (edx & (1 << 26)) != 0 ) { + features |= kCPUFeature_SSE2; + } + if( (ecx & (1 << 0)) != 0 ) { + features |= kCPUFeature_SSE3; + } + if( (ecx & (1 << 9)) != 0 ) { + features |= kCPUFeature_SSE3_S; + } + if( (ecx & (1 << 19)) != 0 ) { + features |= kCPUFeature_SSE4_1; + } + if( (ecx & (1 << 20)) != 0 ) { + features |= kCPUFeature_SSE4_2; + } + if( (ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0 ) { + xgetbv(0, &eax, &edx); + if( (eax & 6) == 6 ) { + features |= kCPUFeature_AVX; + } + } + return features; +} + +#include + +static int haveCPUFeature(unsigned int feature) { + static unsigned int 
sCPUFeatures = 0; + static int sDetectedCPUFeatures = 0; + if (!sDetectedCPUFeatures) { + sDetectedCPUFeatures = 1; + sCPUFeatures = checkCPUFeatures(); + if ((sCPUFeatures & kCPUFeature_AVX) != 0) { + printf("torch running avx\n"); + } else { + printf("torch running sse \n"); + } + } + return (sCPUFeatures & feature) != 0; +} + +#endif + +#include + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); + +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols) { +#if defined(__AVX__) + int avx = haveCPUFeature(kCPUFeature_AVX); + if (avx) + { + convolve_5x5_avx(output, input, kernel, outRows, outCols, outCols, inCols); + } + else +#endif + { + convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols); + } +} diff --git a/aten/src/TH/generic/simd/convolve.h b/aten/src/TH/generic/simd/convolve.h new file mode 100644 index 0000000..fa04ce9 --- /dev/null +++ b/aten/src/TH/generic/simd/convolve.h @@ -0,0 +1 @@ +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols); \ No newline at end of file diff --git a/aten/src/TH/generic/simd/convolve5x5_avx.cpp b/aten/src/TH/generic/simd/convolve5x5_avx.cpp new file mode 100644 index 0000000..560474b --- /dev/null +++ b/aten/src/TH/generic/simd/convolve5x5_avx.cpp @@ -0,0 +1,214 @@ +#include +#include "common_simd.h" +#include + + +#define CLEAR_AVX() _mm256_zeroupper() + +void convolve_5x5_1_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_1() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(1, i) + } +} + +void convolve_5x5_2_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_2() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(2, i) + } +} + +void convolve_5x5_4_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_4() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(4, i) + } +} + +void convolve_5x5_5_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_5() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(5, i) + } +} + +void convolve_5x5_6_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_6() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(6, i) + } +} + +void convolve_5x5_7_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_7() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(7, i) + } +} + +void convolve_5x5_8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t 
alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_8() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(8, i) + } +} + +void convolve_5x5_64x64_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 60; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + CONVOLVE_8COLS_XROWS(6, 16) + CONVOLVE_8COLS_XROWS(6, 24) + CONVOLVE_8COLS_XROWS(6, 32) + CONVOLVE_8COLS_XROWS(6, 40) + CONVOLVE_8COLS_XROWS(6, 48) + CONVOLVE_8COLS_XROWS(6, 56) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_8COLS_XROWS(4, 0) + CONVOLVE_8COLS_XROWS(4, 8) + CONVOLVE_8COLS_XROWS(4, 16) + CONVOLVE_8COLS_XROWS(4, 24) + CONVOLVE_8COLS_XROWS(4, 32) + CONVOLVE_8COLS_XROWS(4, 40) + CONVOLVE_8COLS_XROWS(4, 48) + CONVOLVE_8COLS_XROWS(4, 56) +} + +void convolve_5x5_32x32_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 30; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + CONVOLVE_8COLS_XROWS(6, 16) + CONVOLVE_8COLS_XROWS(6, 24) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_2() + CONVOLVE_8COLS_XROWS(2, 0) + CONVOLVE_8COLS_XROWS(2, 8) + CONVOLVE_8COLS_XROWS(2, 16) + CONVOLVE_8COLS_XROWS(2, 24) +} + +void convolve_5x5_16x16_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 12; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_8COLS_XROWS(4, 0) + CONVOLVE_8COLS_XROWS(4, 8) +} + +void convolve_5x5_8x8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + DECLARE_OUTPUT_8() + CONVOLVE_8COLS_XROWS(8, 0) +} + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); + +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t ic = inCols; + int64_t yy = 0; + float* t_ = input; + float* r_ = output; + float* k_ = kernel; + + if((outRows == 64) && (outCols == 64)) { + convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 32) && (outCols == 32)) { + convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 16) && (outCols == 16)) { + convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 8) && (outCols == 8)) { + convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + for(; yy < (outRows / 6 ) * 6; yy += 6) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_6_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 6); + } + + // more than 2 rows left to process and we ended up on a non-multiple of 4 + if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { + // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 2); + yy += 2; + } + + for(; yy < (outRows & 
0xFFFFFFFC); yy += 4) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_4_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 4); + } + + for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 2); + } + + for(; yy < outRows; yy += 1) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_1_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 1); + } + + int64_t procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time + int64_t remCols = outCols - procCols; + + //process the rest using sse + if( remCols > 0) { + CLEAR_AVX(); + convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols); + } +} \ No newline at end of file diff --git a/aten/src/TH/generic/simd/convolve5x5_sse.cpp b/aten/src/TH/generic/simd/convolve5x5_sse.cpp new file mode 100644 index 0000000..9de9a4a --- /dev/null +++ b/aten/src/TH/generic/simd/convolve5x5_sse.cpp @@ -0,0 +1,321 @@ +#include +#include "common_simd.h" +#include + + +/* SSE variants */ +void convolve_5x5_1_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_1() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(1, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + } +} + +void convolve_5x5_2_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_2() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(2, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + } +} + +void convolve_5x5_4_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_4() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(4, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = 
output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + } +} + +void convolve_5x5_6_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_6() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(6, i) + } + for (; i<(count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + float output4 = output[i + outputStride * 4]; + float output5 = output[i + outputStride * 5]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; + output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + output[i + outputStride * 4] = output4; + output[i + outputStride * 5] = output5; + } +} + +void convolve_5x5_8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_8() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(8, i) + } + for (; i<(count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + float output4 = output[i + outputStride * 4]; + float output5 = output[i + outputStride * 5]; + float output6 = output[i + outputStride * 6]; + float output7 = output[i + outputStride * 7]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; + output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; + output6 += weight[5 * row + col] * image[i + (row + 6) * inputStride + col]; + output7 += weight[5 * row + col] * image[i + (row + 7) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + output[i + outputStride * 4] = output4; + output[i + outputStride * 5] = output5; + output[i + outputStride * 6] = output6; + output[i + outputStride * 7] = output7; + } +} + +#define UNROLL_SSE_CONVOLUTION 0 +#if (UNROLL_SSE_CONVOLUTION) + +void convolve_5x5_64x64_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 60; i+=6) + { + DECLARE_OUTPUT_6() 
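+    // Fully unrolled 64x64 case: each CONVOLVE_4COLS_XROWS(6, c) accumulates
+    // a 6-row by 4-column output tile starting at column c, so the sixteen
+    // calls below cover all 64 columns for the current block of 6 rows; the
+    // DECLARE_OUTPUT_4 block after the loop handles the remaining 4 rows.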
+ CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + CONVOLVE_4COLS_XROWS(6, 16) + CONVOLVE_4COLS_XROWS(6, 20) + CONVOLVE_4COLS_XROWS(6, 24) + CONVOLVE_4COLS_XROWS(6, 28) + CONVOLVE_4COLS_XROWS(6, 32) + CONVOLVE_4COLS_XROWS(6, 36) + CONVOLVE_4COLS_XROWS(6, 40) + CONVOLVE_4COLS_XROWS(6, 44) + CONVOLVE_4COLS_XROWS(6, 48) + CONVOLVE_4COLS_XROWS(6, 52) + CONVOLVE_4COLS_XROWS(6, 56) + CONVOLVE_4COLS_XROWS(6, 60) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_4COLS_XROWS(4, 0) + CONVOLVE_4COLS_XROWS(4, 4) + CONVOLVE_4COLS_XROWS(4, 8) + CONVOLVE_4COLS_XROWS(4, 12) + CONVOLVE_4COLS_XROWS(4, 16) + CONVOLVE_4COLS_XROWS(4, 20) + CONVOLVE_4COLS_XROWS(4, 24) + CONVOLVE_4COLS_XROWS(4, 28) + CONVOLVE_4COLS_XROWS(4, 32) + CONVOLVE_4COLS_XROWS(4, 36) + CONVOLVE_4COLS_XROWS(4, 40) + CONVOLVE_4COLS_XROWS(4, 44) + CONVOLVE_4COLS_XROWS(4, 48) + CONVOLVE_4COLS_XROWS(4, 52) + CONVOLVE_4COLS_XROWS(4, 56) + CONVOLVE_4COLS_XROWS(4, 60) +} + +void convolve_5x5_32x32_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 30; i+=6) + { + DECLARE_OUTPUT_6() + + CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + CONVOLVE_4COLS_XROWS(6, 16) + CONVOLVE_4COLS_XROWS(6, 20) + CONVOLVE_4COLS_XROWS(6, 24) + CONVOLVE_4COLS_XROWS(6, 28) + + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_2() + CONVOLVE_4COLS_XROWS(2, 0) + CONVOLVE_4COLS_XROWS(2, 4) + CONVOLVE_4COLS_XROWS(2, 8) + CONVOLVE_4COLS_XROWS(2, 12) + CONVOLVE_4COLS_XROWS(2, 16) + CONVOLVE_4COLS_XROWS(2, 20) + CONVOLVE_4COLS_XROWS(2, 24) + CONVOLVE_4COLS_XROWS(2, 28) +} + +void convolve_5x5_16x16_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 12; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_4COLS_XROWS(4, 0) + CONVOLVE_4COLS_XROWS(4, 4) + CONVOLVE_4COLS_XROWS(4, 8) + CONVOLVE_4COLS_XROWS(4, 12) +} + +void convolve_5x5_8x8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + DECLARE_OUTPUT_8() + CONVOLVE_4COLS_XROWS(8, 0) + CONVOLVE_4COLS_XROWS(8, 4) +} + +#endif + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t yy = 0; + float* t_ = input; + float* r_ = output; + float* k_ = kernel; +#if (UNROLL_SSE_CONVOLUTION) + if((outRows == 64) && (outCols == 64)) { + convolve_5x5_64x64_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 32) && (outCols == 32)) { + convolve_5x5_32x32_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 16) && (outCols == 16)) { + convolve_5x5_16x16_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 8) && (outCols == 8)) { + convolve_5x5_8x8_sse(output, input, kernel, outRows, outStride, inCols); + return; + } +#endif + for(; yy < (outRows / 6 ) * 6; yy += 6) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_6_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 6); + } + // more than 
2 rows left to process and we ended up on a non-multiple of 4 + if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { + // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 2); + yy += 2; + } + + for(; yy < (outRows & 0xFFFFFFFC); yy += 4) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_4_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 4); + } + + for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 2); + } + + for(; yy < outRows; yy += 1) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_1_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 1); + } +} diff --git a/aten/src/TH/generic/simd/simd.h b/aten/src/TH/generic/simd/simd.h new file mode 100644 index 0000000..33c08b0 --- /dev/null +++ b/aten/src/TH/generic/simd/simd.h @@ -0,0 +1,165 @@ +#ifndef TH_SIMD_INC +#define TH_SIMD_INC + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +// Can be found on Intel ISA Reference for CPUID +#define CPUID_AVX2_BIT 0x20 // Bit 5 of EBX for EAX=0x7 +#define CPUID_AVX_BIT 0x10000000 // Bit 28 of ECX for EAX=0x1 +#define CPUID_SSE_BIT 0x2000000 // bit 25 of EDX for EAX=0x1 + +// Helper macros for initialization +#define FUNCTION_IMPL(NAME, EXT) \ + { (void *)NAME, \ + EXT \ + } + +#define INIT_DISPATCH_PTR(OP) \ + do { \ + size_t i; \ + for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \ + THVector_(OP ## _DISPATCHPTR) = reinterpret_cast(THVector_(OP ## _DISPATCHTABLE)[i].function); \ + if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) { \ + break; \ + } \ + } \ + } while(0) + + +typedef struct FunctionDescription +{ + void *function; + uint32_t supportedSimdExt; +} FunctionDescription; + + +enum SIMDExtensions +{ +#if defined(__NEON__) + SIMDExtension_NEON = 0x1, +#elif defined(__PPC64__) + SIMDExtension_VSX = 0x1, +#else + SIMDExtension_AVX2 = 0x1, + SIMDExtension_AVX = 0x2, + SIMDExtension_SSE = 0x4, +#endif + SIMDExtension_DEFAULT = 0x0 +}; + + +#if defined(__arm__) || defined(__aarch64__) // incl. 
armel, armhf, arm64 + + #if defined(__NEON__) + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_NEON; +} + + #else //ARM without NEON + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_DEFAULT; +} + + #endif + +#elif defined(__PPC64__) + + #if defined(__VSX__) + +static inline uint32_t detectHostSIMDExtensions() +{ + uint32_t hostSimdExts = SIMDExtension_DEFAULT; + char *evar; + + evar = getenv("TH_NO_VSX"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + hostSimdExts = SIMDExtension_VSX; + return hostSimdExts; +} + + #else //PPC64 without VSX + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_DEFAULT; +} + + #endif + +#else // x86 +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ +#if defined(_MSC_VER) + uint32_t cpuInfo[4]; + __cpuid((int *)cpuInfo, *eax); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid (level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile ( "cpuid\n\t" + : "+a"(a), "=b"(b), "+c"(c), "=d"(d) ); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +static inline uint32_t detectHostSIMDExtensions() +{ + uint32_t eax, ebx, ecx, edx; + uint32_t hostSimdExts = 0x0; + int TH_NO_AVX = 1, TH_NO_AVX2 = 1, TH_NO_SSE = 1; + char *evar; + + evar = getenv("TH_NO_AVX2"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_AVX2 = 0; + + // Check for AVX2. Requires separate CPUID + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if ((ebx & CPUID_AVX2_BIT) && TH_NO_AVX2 == 0) { + hostSimdExts |= SIMDExtension_AVX2; + } + + // Detect and enable AVX and SSE + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + evar = getenv("TH_NO_AVX"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_AVX = 0; + if (ecx & CPUID_AVX_BIT && TH_NO_AVX == 0) { + hostSimdExts |= SIMDExtension_AVX; + } + + evar = getenv("TH_NO_SSE"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_SSE = 0; + if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) { + hostSimdExts |= SIMDExtension_SSE; + } + + return hostSimdExts; +} + +#endif // end SIMD extension detection code + +#endif diff --git a/aten/src/TH/vector/AVX.cpp b/aten/src/TH/vector/AVX.cpp new file mode 100644 index 0000000..b39b803 --- /dev/null +++ b/aten/src/TH/vector/AVX.cpp @@ -0,0 +1,309 @@ +#if defined(__AVX__) +#ifndef _MSC_VER +#include +#else +#include +#endif + +#include "AVX.h" +#include "THGeneral.h" + +void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + for (i=0; i<=((n)-8); i+=8) { + _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i)); + _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4)); + } + off = (n) - ((n)%8); + for (i=0; i<((n)%8); i++) { + y[off+i] = x[off+i]; + } +} + +void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + __m256d YMM0 = _mm256_set_pd(c, c, c, c); + for (i=0; i<=((n)-16); i+=16) { + _mm256_storeu_pd((x)+i , YMM0); + _mm256_storeu_pd((x)+i+4, YMM0); + _mm256_storeu_pd((x)+i+8, YMM0); + _mm256_storeu_pd((x)+i+12, YMM0); + } + off = (n) - ((n)%16); + for (i=0; i<((n)%16); i++) { + x[off+i] = c; + } +} + +void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) __ubsan_ignore_float_divide_by_zero__ { + ptrdiff_t i; + __m256d YMM0, YMM1, YMM2, YMM3; + for (i=0; 
i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_loadu_pd(y+i); + YMM3 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_div_pd(YMM0, YMM2); + YMM3 = _mm256_div_pd(YMM1, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) __ubsan_ignore_float_divide_by_zero__ { + ptrdiff_t i; + __m256d YMM15 = _mm256_set_pd(c, c, c, c); + __m256d YMM0, YMM1; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM0 = _mm256_div_pd(YMM0, YMM15); + YMM1 = _mm256_div_pd(YMM1, YMM15); + _mm256_storeu_pd(y+i, YMM0); + _mm256_storeu_pd(y+i+4, YMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + __m256d YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_loadu_pd(y+i); + YMM3 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_mul_pd(YMM0, YMM2); + YMM3 = _mm256_mul_pd(YMM1, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i + +TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); +TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +TH_API void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +TH_API void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cvtFromInt_AVX(double *y, const int *x, const ptrdiff_t n); +TH_API void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n); +TH_API void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +TH_API void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +TH_API void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); +TH_API void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cvtFromInt_AVX(float *y, const int *x, const ptrdiff_t n); +#endif diff --git a/aten/src/TH/vector/AVX2.cpp b/aten/src/TH/vector/AVX2.cpp new file mode 100644 index 0000000..bde22d3 --- /dev/null +++ b/aten/src/TH/vector/AVX2.cpp @@ -0,0 +1,130 @@ +#if defined(__AVX2__) +#ifndef _MSC_VER +#include +#else +#include +#include +#endif +#include "AVX2.h" +#include +#include "../THRandom.h" + +void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m256d YMM15 = _mm256_set_pd(c, c, c, c); + __m256d 
YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(y+i); + YMM1 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_loadu_pd(x+i); + YMM3 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_fmadd_pd(YMM0, YMM15, YMM2); + YMM3 = _mm256_fmadd_pd(YMM1, YMM15, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] + y[i] * c; + } +} + +void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); + __m256 YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-16); i+=16) { + YMM0 = _mm256_loadu_ps(y+i); + YMM1 = _mm256_loadu_ps(y+i+8); + YMM2 = _mm256_loadu_ps(x+i); + YMM3 = _mm256_loadu_ps(x+i+8); + YMM2 = _mm256_fmadd_ps(YMM0, YMM15, YMM2); + YMM3 = _mm256_fmadd_ps(YMM1, YMM15, YMM3); + _mm256_storeu_ps(z+i, YMM2); + _mm256_storeu_ps(z+i+8, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] + y[i] * c; + } +} + +static void normal_fill_16_AVX2(float *data, + const __m256* two_pi, + const __m256* one, + const __m256* minus_two, + const __m256* mean, + const __m256* stddev) { + const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data)); + const __m256 u2 = _mm256_loadu_ps(data + 8); + + // sincos256_ps and log256_ps are from avx_mathfun.h + const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1))); + const __m256 theta = _mm256_mul_ps(*two_pi, u2); + + __m256 sintheta, costheta; + sincos256_ps(theta, &sintheta, &costheta); + + const __m256 n1 = _mm256_mul_ps(radius, costheta); + const __m256 n2 = _mm256_mul_ps(radius, sintheta); + + _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *stddev, *mean)); + _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *stddev, *mean)); +} + +void THFloatVector_normal_fill_AVX2(float *data, + const int64_t size, + THGenerator *generator, + const float mean, + const float stddev) +{ + THAssert(size >= 16 && "Size must be >= 16 for AVX2 normal fill"); + const __m256 two_pi = _mm256_set1_ps(2.0f * M_PI); + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 minus_two = _mm256_set1_ps(-2.0f); + const __m256 mean_v = _mm256_set1_ps(mean); + const __m256 stddev_v = _mm256_set1_ps(stddev); + + // First fill the data with the uniform numbers. Box-Mueller is a 2 -> 2 + // mapping of 2 uniform numbers to 2 normal numbers (per iteration), so we + // we need exactly as much space for uniform and normal numbers and can just + // use the single buffer for both. + for (int64_t i = 0; i < size; ++i) { + data[i] = THRandom_uniformFloat(generator, 0, 1); + } + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &stddev_v); + } + + if (size % 16 != 0) { + // We rewind so that we have 16 values and then compute them in one step. 
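+    // Rewinding to the last 16 elements overlaps the block transformed above;
+    // the overlapped entries are simply refilled with fresh uniforms and
+    // re-transformed, so they stay normally distributed. The size >= 16
+    // assertion above guarantees the rewind never runs past the start of
+    // the buffer.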
+ data = data + size - 16; + for (int i = 0; i < 16; ++i) { + data[i] = THRandom_uniformFloat(generator, 0, 1); + } + normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &stddev_v); + } +} + +void THFloatVector_sigmoid_AVX2(float *y, const float *x, const ptrdiff_t n) { + ptrdiff_t i; + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 zero = _mm256_set1_ps(0.0f); + __m256 YMM0, YMM1, YMM2, YMM3; + for (i = 0; i <= ((n)-16); i += 16) { + YMM0 = _mm256_loadu_ps(x + i); + YMM1 = _mm256_loadu_ps(x + i + 8); + YMM0 = _mm256_sub_ps(zero, YMM0); + YMM1 = _mm256_sub_ps(zero, YMM1); + YMM2 = _mm256_add_ps(one, exp256_ps(YMM0)); + YMM3 = _mm256_add_ps(one, exp256_ps(YMM1)); + YMM2 = _mm256_div_ps(one, YMM2); + YMM3 = _mm256_div_ps(one, YMM3); + _mm256_storeu_ps(y + i, YMM2); + _mm256_storeu_ps(y + i + 8, YMM3); + } + for (; i < (n); i++) { + y[i] = 1.0f / (1.0f + expf(-x[i])); + } +} + +#endif // defined(__AVX2__) diff --git a/aten/src/TH/vector/AVX2.h b/aten/src/TH/vector/AVX2.h new file mode 100644 index 0000000..1c281d8 --- /dev/null +++ b/aten/src/TH/vector/AVX2.h @@ -0,0 +1,19 @@ +#ifndef TH_AVX2_H +#define TH_AVX2_H + +#include "THGeneral.h" + +#include +#include + +struct THGenerator; + +TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); +TH_API void THFloatVector_normal_fill_AVX2(float *data, + const int64_t size, + struct THGenerator *generator, + const float mean, + const float stddev); +TH_API void THFloatVector_sigmoid_AVX2(float *y, const float *x, const ptrdiff_t n); +#endif diff --git a/aten/src/TH/vector/NEON.cpp b/aten/src/TH/vector/NEON.cpp new file mode 100644 index 0000000..3966ace --- /dev/null +++ b/aten/src/TH/vector/NEON.cpp @@ -0,0 +1,105 @@ +static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + x[i] = c; + x[i+1] = c; + x[i+2] = c; + x[i+3] = c; + } + + for(; i < n; i++) + x[i] = c; + +} + +static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + z[i] = x[i] * y[i]; + z[i+1] = x[i+1] * y[i+1]; + z[i+2] = x[i+2] * y[i+2]; + z[i+3] = x[i+3] * y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] * y[i]; +} + +static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + y[i] = x[i] * c; + y[i+1] = x[i+1] * c; + y[i+2] = x[i+2] * c; + y[i+3] = x[i+3] * c; + } + + for(; i < n; i++) + y[i] = x[i] * c; +} + +static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + z[i] = x[i] + c * y[i]; + z[i+1] = x[i+1] + c * y[i+1]; + z[i+2] = x[i+2] + c * y[i+2]; + z[i+3] = x[i+3] + c * y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] + c * y[i]; +} + +static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + y[i] = x[i] + c; + y[i+1] = x[i+1] + c; + y[i+2] = x[i+2] + c; + y[i+3] = x[i+3] + c; + } + + for(; i < n; i++) + y[i] = x[i] + c; +} + +static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + z[i] = x[i] / y[i]; + z[i+1] = x[i+1] / y[i+1]; + z[i+2] = x[i+2] / y[i+2]; 
+ z[i+3] = x[i+3] / y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] / y[i]; +} + +static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + y[i] = x[i] / c; + y[i+1] = x[i+1] / c; + y[i+2] = x[i+2] / c; + y[i+3] = x[i+3] / c; + } + + for(; i < n; i++) + y[i] = x[i] / c; +} diff --git a/aten/src/TH/vector/SSE.cpp b/aten/src/TH/vector/SSE.cpp new file mode 100644 index 0000000..20d5893 --- /dev/null +++ b/aten/src/TH/vector/SSE.cpp @@ -0,0 +1,303 @@ +#ifndef _MSC_VER +#include +#else +#include +#endif + +static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + __m128d XMM0 = _mm_set1_pd(c); + for (i=0; i<=((n)-8); i+=8) { + _mm_storeu_pd((x)+i , XMM0); + _mm_storeu_pd((x)+i+2, XMM0); + _mm_storeu_pd((x)+i+4, XMM0); + _mm_storeu_pd((x)+i+6, XMM0); + } + off = (n) - ((n)%8); + for (i=0; i<((n)%8); i++) { + x[off+i] = c; + } +} + +static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM2; + for (i=0; i<=((n)-2); i+=2) { + XMM0 = _mm_loadu_pd((x)+i); + XMM2 = _mm_loadu_pd((y)+i); + XMM2 = _mm_mul_pd(XMM2, XMM7); + XMM2 = _mm_add_pd(XMM0, XMM2); + _mm_storeu_pd((z)+i, XMM2); + } + for (; i<(n); i++) { + z[i] = x[i] + c * y[i]; + } +} + +static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM2; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_pd((x)+i); + XMM2 = _mm_loadu_pd((x)+i+2); + XMM0 = _mm_add_pd(XMM0, XMM7); + XMM2 = _mm_add_pd(XMM2, XMM7); + _mm_storeu_pd((y)+i, XMM0); + _mm_storeu_pd((y)+i+2, XMM2); + } + for (; i<(n); i++) { + y[i] = x[i] + c; + } +} + +static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + for (i=0; i<=((n)-8); i+=8) { + __m128d XMM0 = _mm_loadu_pd((x)+i ); + __m128d XMM1 = _mm_loadu_pd((x)+i+2); + __m128d XMM2 = _mm_loadu_pd((x)+i+4); + __m128d XMM3 = _mm_loadu_pd((x)+i+6); + __m128d XMM4 = _mm_loadu_pd((y)+i ); + __m128d XMM5 = _mm_loadu_pd((y)+i+2); + __m128d XMM6 = _mm_loadu_pd((y)+i+4); + __m128d XMM7 = _mm_loadu_pd((y)+i+6); + XMM4 = _mm_mul_pd(XMM4, XMM0); + XMM5 = _mm_mul_pd(XMM5, XMM1); + XMM6 = _mm_mul_pd(XMM6, XMM2); + XMM7 = _mm_mul_pd(XMM7, XMM3); + _mm_storeu_pd((z)+i , XMM4); + _mm_storeu_pd((z)+i+2, XMM5); + _mm_storeu_pd((z)+i+4, XMM6); + _mm_storeu_pd((z)+i+6, XMM7); + } + for (; i<(n); i++) { + z[i] = x[i] * y[i]; + } +} + +static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM15 = _mm_set1_pd(c); + for (i=0; i<=((n)-8); i+=8) { + __m128d XMM0 = _mm_loadu_pd((x)+i ); + __m128d XMM1 = _mm_loadu_pd((x)+i+2); + __m128d XMM2 = _mm_loadu_pd((x)+i+4); + __m128d XMM3 = _mm_loadu_pd((x)+i+6); + __m128d XMM4 = _mm_mul_pd(XMM15, XMM0); + __m128d XMM5 = _mm_mul_pd(XMM15, XMM1); + __m128d XMM6 = _mm_mul_pd(XMM15, XMM2); + __m128d XMM7 = _mm_mul_pd(XMM15, XMM3); + _mm_storeu_pd((y)+i , XMM4); + _mm_storeu_pd((y)+i+2, XMM5); + _mm_storeu_pd((y)+i+4, XMM6); + _mm_storeu_pd((y)+i+6, XMM7); + } + for (; i<(n); i++) { + y[i] = x[i] * c; + } +} + +static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM0, XMM1, XMM2, XMM3; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = 
_mm_loadu_pd(x+i); + XMM1 = _mm_loadu_pd(x+i+2); + XMM2 = _mm_loadu_pd(y+i); + XMM3 = _mm_loadu_pd(y+i+2); + XMM2 = _mm_div_pd(XMM0, XMM2); + XMM3 = _mm_div_pd(XMM1, XMM3); + _mm_storeu_pd(z+i, XMM2); + _mm_storeu_pd(z+i+2, XMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM1; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_pd(x+i); + XMM1 = _mm_loadu_pd(x+i+2); + XMM0 = _mm_div_pd(XMM0, XMM7); + XMM1 = _mm_div_pd(XMM1, XMM7); + _mm_storeu_pd(y+i, XMM0); + _mm_storeu_pd(y+i+2, XMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM0 = _mm_set_ps1(c); + ptrdiff_t off; + for (i=0; i<=((n)-16); i+=16) { + _mm_storeu_ps((x)+i , XMM0); + _mm_storeu_ps((x)+i+4, XMM0); + _mm_storeu_ps((x)+i+8, XMM0); + _mm_storeu_ps((x)+i+12, XMM0); + } + off = (n) - ((n)%16); + for (i=0; i<((n)%16); i++) { + x[off+i] = c; + } +} + + +static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set_ps1(c); + __m128 XMM0, XMM2; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_ps((x)+i); + XMM2 = _mm_loadu_ps((y)+i); + XMM2 = _mm_mul_ps(XMM2, XMM7); + XMM2 = _mm_add_ps(XMM0, XMM2); + _mm_storeu_ps((z)+i, XMM2); + } + for (; i<(n); i++) { + z[i] = x[i] + c * y[i]; + } +} + +static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set1_ps(c); + __m128 XMM0, XMM2; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps((x)+i); + XMM2 = _mm_loadu_ps((x)+i+4); + XMM0 = _mm_add_ps(XMM0, XMM7); + XMM2 = _mm_add_ps(XMM2, XMM7); + _mm_storeu_ps((y)+i, XMM0); + _mm_storeu_ps((y)+i+4, XMM2); + } + for (; i<(n); i++) { + y[i] = x[i] + c; + } +} + +static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) { + ptrdiff_t i; + for (i=0; i<=((n)-16); i+=16) { + __m128 XMM0 = _mm_loadu_ps((x)+i ); + __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); + __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); + __m128 XMM3 = _mm_loadu_ps((x)+i+12); + __m128 XMM4 = _mm_loadu_ps((y)+i ); + __m128 XMM5 = _mm_loadu_ps((y)+i+ 4); + __m128 XMM6 = _mm_loadu_ps((y)+i+ 8); + __m128 XMM7 = _mm_loadu_ps((y)+i+12); + XMM4 = _mm_mul_ps(XMM4, XMM0); + XMM5 = _mm_mul_ps(XMM5, XMM1); + XMM6 = _mm_mul_ps(XMM6, XMM2); + XMM7 = _mm_mul_ps(XMM7, XMM3); + _mm_storeu_ps((z)+i , XMM4); + _mm_storeu_ps((z)+i+ 4, XMM5); + _mm_storeu_ps((z)+i+ 8, XMM6); + _mm_storeu_ps((z)+i+12, XMM7); + } + for (; i<(n); i++) { + z[i] = x[i] * y[i]; + } +} + +static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM15 = _mm_set_ps1(c); + for (i=0; i<=((n)-16); i+=16) { + __m128 XMM0 = _mm_loadu_ps((x)+i ); + __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); + __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); + __m128 XMM3 = _mm_loadu_ps((x)+i+12); + __m128 XMM4 = _mm_mul_ps(XMM15, XMM0); + __m128 XMM5 = _mm_mul_ps(XMM15, XMM1); + __m128 XMM6 = _mm_mul_ps(XMM15, XMM2); + __m128 XMM7 = _mm_mul_ps(XMM15, XMM3); + _mm_storeu_ps((y)+i , XMM4); + _mm_storeu_ps((y)+i+ 4, XMM5); + _mm_storeu_ps((y)+i+ 8, XMM6); + _mm_storeu_ps((y)+i+12, XMM7); + } + for (; i<(n); i++) { + y[i] = x[i] * c; + } +} + +static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const 
ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM0, XMM1, XMM2, XMM3; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps(x+i); + XMM1 = _mm_loadu_ps(x+i+4); + XMM2 = _mm_loadu_ps(y+i); + XMM3 = _mm_loadu_ps(y+i+4); + XMM2 = _mm_div_ps(XMM0, XMM2); + XMM3 = _mm_div_ps(XMM1, XMM3); + _mm_storeu_ps(z+i, XMM2); + _mm_storeu_ps(z+i+4, XMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set1_ps(c); + __m128 XMM0, XMM1; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps(x+i); + XMM1 = _mm_loadu_ps(x+i+4); + XMM0 = _mm_div_ps(XMM0, XMM7); + XMM1 = _mm_div_ps(XMM1, XMM7); + _mm_storeu_ps(y+i, XMM0); + _mm_storeu_ps(y+i+4, XMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +static void THFloatVector_cvtFromInt_SSE(float *y, const int *x, const ptrdiff_t n) { + ptrdiff_t i; + __m128i YMM0, YMM1; + __m128 YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm_loadu_si128((__m128i const*)(x+i)); + YMM1 = _mm_loadu_si128((__m128i const*)(x+i+4)); + YMM2 = _mm_cvtepi32_ps(YMM0); + YMM3 = _mm_cvtepi32_ps(YMM1); + _mm_storeu_ps(y+i, YMM2); + _mm_storeu_ps(y+i+4, YMM3); + } + for (; i<(n); i++) { + y[i] = (float)x[i]; + } +} + +static void THDoubleVector_cvtFromInt_SSE(double *y, const int *x, const ptrdiff_t n) { + ptrdiff_t i; + __m128i YMM0, YMM1; + __m128d YMM2, YMM3; + for (i=0; i<=((n)- 4); i+=4) { + YMM0 = _mm_loadu_si128((__m128i const*)(x+i)); + YMM2 = _mm_cvtepi32_pd(YMM0); + YMM1 = _mm_srli_si128(YMM0, 8); + YMM3 = _mm_cvtepi32_pd(YMM1); + _mm_storeu_pd(y+i, YMM2); + _mm_storeu_pd(y+i+2, YMM3); + } + for (; i<(n); i++) { + y[i] = (double)x[i]; + } +} + diff --git a/aten/src/TH/vector/VSX.cpp b/aten/src/TH/vector/VSX.cpp new file mode 100644 index 0000000..f01718c --- /dev/null +++ b/aten/src/TH/vector/VSX.cpp @@ -0,0 +1,2520 @@ +#ifdef __PPC64__ +#include +#include + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_fill_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double fp64vec2 = vec_xl(0, val); + + for (i = 0; i <= n-128; i += 128) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + vec_xst(fp64vec2, 0, x+(i+16 )); + vec_xst(fp64vec2, 0, x+(i+18 )); + vec_xst(fp64vec2, 0, x+(i+20 )); + vec_xst(fp64vec2, 0, x+(i+22 )); + vec_xst(fp64vec2, 0, x+(i+24 )); + vec_xst(fp64vec2, 0, x+(i+26 )); + vec_xst(fp64vec2, 0, x+(i+28 )); + vec_xst(fp64vec2, 0, x+(i+30 )); + vec_xst(fp64vec2, 0, x+(i+32 )); + vec_xst(fp64vec2, 0, x+(i+34 )); + vec_xst(fp64vec2, 0, x+(i+36 )); + vec_xst(fp64vec2, 0, x+(i+38 )); + vec_xst(fp64vec2, 0, x+(i+40 )); + vec_xst(fp64vec2, 0, x+(i+42 )); + vec_xst(fp64vec2, 0, x+(i+44 )); + vec_xst(fp64vec2, 0, x+(i+46 )); + vec_xst(fp64vec2, 0, x+(i+48 )); + vec_xst(fp64vec2, 0, x+(i+50 )); + vec_xst(fp64vec2, 0, x+(i+52 )); + vec_xst(fp64vec2, 0, x+(i+54 )); + vec_xst(fp64vec2, 0, x+(i+56 )); + vec_xst(fp64vec2, 0, x+(i+58 )); + vec_xst(fp64vec2, 0, x+(i+60 )); + vec_xst(fp64vec2, 0, x+(i+62 )); + vec_xst(fp64vec2, 0, x+(i+64 )); + 
vec_xst(fp64vec2, 0, x+(i+66 )); + vec_xst(fp64vec2, 0, x+(i+68 )); + vec_xst(fp64vec2, 0, x+(i+70 )); + vec_xst(fp64vec2, 0, x+(i+72 )); + vec_xst(fp64vec2, 0, x+(i+74 )); + vec_xst(fp64vec2, 0, x+(i+76 )); + vec_xst(fp64vec2, 0, x+(i+78 )); + vec_xst(fp64vec2, 0, x+(i+80 )); + vec_xst(fp64vec2, 0, x+(i+82 )); + vec_xst(fp64vec2, 0, x+(i+84 )); + vec_xst(fp64vec2, 0, x+(i+86 )); + vec_xst(fp64vec2, 0, x+(i+88 )); + vec_xst(fp64vec2, 0, x+(i+90 )); + vec_xst(fp64vec2, 0, x+(i+92 )); + vec_xst(fp64vec2, 0, x+(i+94 )); + vec_xst(fp64vec2, 0, x+(i+96 )); + vec_xst(fp64vec2, 0, x+(i+98 )); + vec_xst(fp64vec2, 0, x+(i+100)); + vec_xst(fp64vec2, 0, x+(i+102)); + vec_xst(fp64vec2, 0, x+(i+104)); + vec_xst(fp64vec2, 0, x+(i+106)); + vec_xst(fp64vec2, 0, x+(i+108)); + vec_xst(fp64vec2, 0, x+(i+110)); + vec_xst(fp64vec2, 0, x+(i+112)); + vec_xst(fp64vec2, 0, x+(i+114)); + vec_xst(fp64vec2, 0, x+(i+116)); + vec_xst(fp64vec2, 0, x+(i+118)); + vec_xst(fp64vec2, 0, x+(i+120)); + vec_xst(fp64vec2, 0, x+(i+122)); + vec_xst(fp64vec2, 0, x+(i+124)); + vec_xst(fp64vec2, 0, x+(i+126)); + } + for (; i <= n-16; i += 16) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + } + for (; i <= n-2; i += 2) + vec_xst(fp64vec2, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cadds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cadd_VSX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); + y4_fp64vec2 = vec_madd(y4_fp64vec2, 
c_fp64vec2, x4_fp64vec2); + y5_fp64vec2 = vec_madd(y5_fp64vec2, c_fp64vec2, x5_fp64vec2); + y6_fp64vec2 = vec_madd(y6_fp64vec2, c_fp64vec2, x6_fp64vec2); + y7_fp64vec2 = vec_madd(y7_fp64vec2, c_fp64vec2, x7_fp64vec2); + y8_fp64vec2 = vec_madd(y8_fp64vec2, c_fp64vec2, x8_fp64vec2); + y9_fp64vec2 = vec_madd(y9_fp64vec2, c_fp64vec2, x9_fp64vec2); + y10_fp64vec2 = vec_madd(y10_fp64vec2, c_fp64vec2,x10_fp64vec2); + y11_fp64vec2 = vec_madd(y11_fp64vec2, c_fp64vec2,x11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] + c* y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_adds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_adds_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_add(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_add(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_add(x6_fp64vec2, 
c_fp64vec2); + y7_fp64vec2 = vec_add(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_add(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_add(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_add(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_add(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = x[i] +c; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cmul_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cmul_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2); + y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2); + y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2); + y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2); + y8_fp64vec2 = 
vec_mul(y8_fp64vec2, x8_fp64vec2); + y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2); + y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2); + y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] * y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_muls_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_mul(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_mul(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_mul(x6_fp64vec2, c_fp64vec2); + y7_fp64vec2 = vec_mul(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_mul(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_mul(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_mul(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_mul(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + 
vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = c * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cdiv_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cdiv_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); + y4_fp64vec2 = vec_div(x4_fp64vec2, y4_fp64vec2); + y5_fp64vec2 = vec_div(x5_fp64vec2, y5_fp64vec2); + y6_fp64vec2 = vec_div(x6_fp64vec2, y6_fp64vec2); + y7_fp64vec2 = vec_div(x7_fp64vec2, y7_fp64vec2); + y8_fp64vec2 = vec_div(x8_fp64vec2, y8_fp64vec2); + y9_fp64vec2 = vec_div(x9_fp64vec2, y9_fp64vec2); + y10_fp64vec2 = vec_div(x10_fp64vec2, y10_fp64vec2); + y11_fp64vec2 = vec_div(x11_fp64vec2, y11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + 
vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] / y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_divs_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_divs_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2); + y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + 
vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = x[i] / c; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_fill_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + float val[4] = {c, c, c, c}; + vector float fp32vec4 = vec_xl(0, val); + + for (i = 0; i <= n-256; i += 256) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + vec_xst(fp32vec4, 0, x+(i+32 )); + vec_xst(fp32vec4, 0, x+(i+36 )); + vec_xst(fp32vec4, 0, x+(i+40 )); + vec_xst(fp32vec4, 0, x+(i+44 )); + vec_xst(fp32vec4, 0, x+(i+48 )); + vec_xst(fp32vec4, 0, x+(i+52 )); + vec_xst(fp32vec4, 0, x+(i+56 )); + vec_xst(fp32vec4, 0, x+(i+60 )); + vec_xst(fp32vec4, 0, x+(i+64 )); + vec_xst(fp32vec4, 0, x+(i+68 )); + vec_xst(fp32vec4, 0, x+(i+72 )); + vec_xst(fp32vec4, 0, x+(i+76 )); + vec_xst(fp32vec4, 0, x+(i+80 )); + vec_xst(fp32vec4, 0, x+(i+84 )); + vec_xst(fp32vec4, 0, x+(i+88 )); + vec_xst(fp32vec4, 0, x+(i+92 )); + vec_xst(fp32vec4, 0, x+(i+96 )); + vec_xst(fp32vec4, 0, x+(i+100)); + vec_xst(fp32vec4, 0, x+(i+104)); + vec_xst(fp32vec4, 0, x+(i+108)); + vec_xst(fp32vec4, 0, x+(i+112)); + vec_xst(fp32vec4, 0, x+(i+116)); + vec_xst(fp32vec4, 0, x+(i+120)); + vec_xst(fp32vec4, 0, x+(i+124)); + vec_xst(fp32vec4, 0, x+(i+128)); + vec_xst(fp32vec4, 0, x+(i+132)); + vec_xst(fp32vec4, 0, x+(i+136)); + vec_xst(fp32vec4, 0, x+(i+140)); + vec_xst(fp32vec4, 0, x+(i+144)); + vec_xst(fp32vec4, 0, x+(i+148)); + vec_xst(fp32vec4, 0, x+(i+152)); + vec_xst(fp32vec4, 0, x+(i+156)); + vec_xst(fp32vec4, 0, x+(i+160)); + vec_xst(fp32vec4, 0, x+(i+164)); + vec_xst(fp32vec4, 0, x+(i+168)); + vec_xst(fp32vec4, 0, x+(i+172)); + vec_xst(fp32vec4, 0, x+(i+176)); + vec_xst(fp32vec4, 0, x+(i+180)); + vec_xst(fp32vec4, 0, x+(i+184)); + vec_xst(fp32vec4, 0, x+(i+188)); + vec_xst(fp32vec4, 0, x+(i+192)); + vec_xst(fp32vec4, 0, x+(i+196)); + vec_xst(fp32vec4, 0, x+(i+200)); + vec_xst(fp32vec4, 0, x+(i+204)); + vec_xst(fp32vec4, 0, x+(i+208)); + vec_xst(fp32vec4, 0, x+(i+212)); + vec_xst(fp32vec4, 0, x+(i+216)); + vec_xst(fp32vec4, 0, x+(i+220)); + vec_xst(fp32vec4, 0, x+(i+224)); + vec_xst(fp32vec4, 0, x+(i+228)); + vec_xst(fp32vec4, 0, x+(i+232)); + vec_xst(fp32vec4, 0, x+(i+236)); + vec_xst(fp32vec4, 0, x+(i+240)); + vec_xst(fp32vec4, 0, x+(i+244)); + vec_xst(fp32vec4, 
0, x+(i+248)); + vec_xst(fp32vec4, 0, x+(i+252)); + } + for (; i <= n-32; i += 32) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + } + for (; i <= n-4; i += 4) + vec_xst(fp32vec4, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cadd_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16 )); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4); + y4_fp32vec4 = vec_madd(y4_fp32vec4, c_fp32vec4, x4_fp32vec4); + y5_fp32vec4 = vec_madd(y5_fp32vec4, c_fp32vec4, x5_fp32vec4); + y6_fp32vec4 = vec_madd(y6_fp32vec4, c_fp32vec4, x6_fp32vec4); + y7_fp32vec4 = vec_madd(y7_fp32vec4, c_fp32vec4, x7_fp32vec4); + y8_fp32vec4 = vec_madd(y8_fp32vec4, c_fp32vec4, x8_fp32vec4); + y9_fp32vec4 = vec_madd(y9_fp32vec4, c_fp32vec4, x9_fp32vec4); + y10_fp32vec4 = vec_madd(y10_fp32vec4, c_fp32vec4, x10_fp32vec4); + y11_fp32vec4 = vec_madd(y11_fp32vec4, c_fp32vec4, x11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + 
y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + vec_xst(y0_fp32vec4, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] + c* y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_adds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_adds_VSX(float *y, const float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4); + y4_fp32vec4 = vec_add(x4_fp32vec4, c_fp32vec4); + y5_fp32vec4 = vec_add(x5_fp32vec4, c_fp32vec4); + y6_fp32vec4 = vec_add(x6_fp32vec4, c_fp32vec4); + y7_fp32vec4 = vec_add(x7_fp32vec4, c_fp32vec4); + y8_fp32vec4 = vec_add(x8_fp32vec4, c_fp32vec4); + y9_fp32vec4 = vec_add(x9_fp32vec4, c_fp32vec4); + y10_fp32vec4 = vec_add(x10_fp32vec4, c_fp32vec4); + y11_fp32vec4 = vec_add(x11_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_add(x2_fp32vec4, 
c_fp32vec4); + y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i )); + } + for (; i < n; i++) + y[i] = c + x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cmul_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cmul_VSX(float *z, const float *y, const float *x, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + y4_fp32vec4 = vec_xl(0, y+(i+16 )); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4); + y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4); + y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4); + y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4); + y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4); + y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4); + y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4); + y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = 
vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + vec_xst(y0_fp32vec4, 0, z+(i )); + } + for (; i < n; i++) + z[i] = y[i] * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_muls_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4); + y4_fp32vec4 = vec_mul(x4_fp32vec4, c_fp32vec4); + y5_fp32vec4 = vec_mul(x5_fp32vec4, c_fp32vec4); + y6_fp32vec4 = vec_mul(x6_fp32vec4, c_fp32vec4); + y7_fp32vec4 = vec_mul(x7_fp32vec4, c_fp32vec4); + y8_fp32vec4 = vec_mul(x8_fp32vec4, c_fp32vec4); + y9_fp32vec4 = vec_mul(x9_fp32vec4, c_fp32vec4); + y10_fp32vec4 = vec_mul(x10_fp32vec4, c_fp32vec4); + y11_fp32vec4 = vec_mul(x11_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i 
)); + } + for (; i < n; i++) + y[i] = c * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cdiv_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4)); + y2_fp32vec4 = vec_xl(0, y+(i+8)); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4); + y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4); + y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4); + y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4); + y4_fp32vec4 = vec_div(x4_fp32vec4, y4_fp32vec4); + y5_fp32vec4 = vec_div(x5_fp32vec4, y5_fp32vec4); + y6_fp32vec4 = vec_div(x6_fp32vec4, y6_fp32vec4); + y7_fp32vec4 = vec_div(x7_fp32vec4, y7_fp32vec4); + y8_fp32vec4 = vec_div(x8_fp32vec4, y8_fp32vec4); + y9_fp32vec4 = vec_div(x9_fp32vec4, y9_fp32vec4); + y10_fp32vec4 = vec_div(x10_fp32vec4, y10_fp32vec4); + y11_fp32vec4 = vec_div(x11_fp32vec4, y11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4); + y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4); + y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4); + y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = 
vec_xl(0, x+(i ));
+        y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
+        vec_xst(y0_fp32vec4, 0, z+(i ));
+    }
+    for (; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_divs_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_divs_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    ptrdiff_t i;
+
+    float val[4] = {c, c, c, c};
+    vector float c_fp32vec4 = vec_xl(0, val);
+
+    vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+    vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+    vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+    vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+
+    for (i = 0; i <= n-48; i += 48)
+    {
+        x0_fp32vec4  = vec_xl(0, x+(i   ));
+        x1_fp32vec4  = vec_xl(0, x+(i+4 ));
+        x2_fp32vec4  = vec_xl(0, x+(i+8 ));
+        x3_fp32vec4  = vec_xl(0, x+(i+12));
+        x4_fp32vec4  = vec_xl(0, x+(i+16));
+        x5_fp32vec4  = vec_xl(0, x+(i+20));
+        x6_fp32vec4  = vec_xl(0, x+(i+24));
+        x7_fp32vec4  = vec_xl(0, x+(i+28));
+        x8_fp32vec4  = vec_xl(0, x+(i+32));
+        x9_fp32vec4  = vec_xl(0, x+(i+36));
+        x10_fp32vec4 = vec_xl(0, x+(i+40));
+        x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+        y0_fp32vec4  = vec_div(x0_fp32vec4,  c_fp32vec4);
+        y1_fp32vec4  = vec_div(x1_fp32vec4,  c_fp32vec4);
+        y2_fp32vec4  = vec_div(x2_fp32vec4,  c_fp32vec4);
+        y3_fp32vec4  = vec_div(x3_fp32vec4,  c_fp32vec4);
+        y4_fp32vec4  = vec_div(x4_fp32vec4,  c_fp32vec4);
+        y5_fp32vec4  = vec_div(x5_fp32vec4,  c_fp32vec4);
+        y6_fp32vec4  = vec_div(x6_fp32vec4,  c_fp32vec4);
+        y7_fp32vec4  = vec_div(x7_fp32vec4,  c_fp32vec4);
+        y8_fp32vec4  = vec_div(x8_fp32vec4,  c_fp32vec4);
+        y9_fp32vec4  = vec_div(x9_fp32vec4,  c_fp32vec4);
+        y10_fp32vec4 = vec_div(x10_fp32vec4, c_fp32vec4);
+        y11_fp32vec4 = vec_div(x11_fp32vec4, c_fp32vec4);
+
+
+        vec_xst(y0_fp32vec4,  0, y+(i   ));
+        vec_xst(y1_fp32vec4,  0, y+(i+4 ));
+        vec_xst(y2_fp32vec4,  0, y+(i+8 ));
+        vec_xst(y3_fp32vec4,  0, y+(i+12));
+        vec_xst(y4_fp32vec4,  0, y+(i+16));
+        vec_xst(y5_fp32vec4,  0, y+(i+20));
+        vec_xst(y6_fp32vec4,  0, y+(i+24));
+        vec_xst(y7_fp32vec4,  0, y+(i+28));
+        vec_xst(y8_fp32vec4,  0, y+(i+32));
+        vec_xst(y9_fp32vec4,  0, y+(i+36));
+        vec_xst(y10_fp32vec4, 0, y+(i+40));
+        vec_xst(y11_fp32vec4, 0, y+(i+44));
+    }
+    for (; i <= n-16; i += 16)
+    {
+        x0_fp32vec4 = vec_xl(0, x+(i   ));
+        x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+        x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+        x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+        y0_fp32vec4 = vec_div(x0_fp32vec4, c_fp32vec4);
+        y1_fp32vec4 = vec_div(x1_fp32vec4, c_fp32vec4);
+        y2_fp32vec4 = vec_div(x2_fp32vec4, c_fp32vec4);
+        y3_fp32vec4 = vec_div(x3_fp32vec4, c_fp32vec4);
+
+        vec_xst(y0_fp32vec4, 0, y+(i   ));
+        vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+        vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+        vec_xst(y3_fp32vec4, 0, y+(i+12));
+    }
+    for (; i <= n-4; i += 4)
+    {
+        x0_fp32vec4 = vec_xl(0, x+(i ));
+        y0_fp32vec4 = vec_div(x0_fp32vec4, c_fp32vec4);
+        vec_xst(y0_fp32vec4, 0, y+(i ));
+    }
+    for (; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+
+//------------------------------------------------
+//
+// Testing for correctness and performance
+//
+// If you want to run these tests, compile this
+// file with -DRUN_VSX_TESTS on a Power machine,
+// and then run the executable that is generated.
+//
+//------------------------------------------------
+//
+// Example passing run (from a Power8 machine):
+//
+// $ gcc VSX.cpp -O2 -D RUN_VSX_TESTS -o vsxtest
+// $ ./vsxtest
+//
+// TODO
+//
+//
+// Finished running all tests. All tests PASSED.
+//
+//------------------------------------------------
+#ifdef RUN_VSX_TESTS
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <math.h>
+
+#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
+#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
+
+
+//--------------------------------------------------------------------------------------------------
+// Standard implementations:
+//--------------------------------------------------------------------------------------------------
+static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        x[i] = c;
+}
+
+static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        x[i] = c;
+}
+
+static void standardDouble_cadd(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] + c * y[i];
+}
+
+static void standardFloat_cadd(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] + c * y[i];
+}
+
+static void standardDouble_adds(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c + x[i];
+}
+
+static void standardFloat_adds(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c + x[i];
+}
+
+static void standardDouble_cmul(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] * y[i];
+}
+
+static void standardFloat_cmul(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] * y[i];
+}
+
+static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c * x[i];
+}
+
+static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c * x[i];
+}
+
+static void standardDouble_cdiv(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+static void standardFloat_cdiv(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+static void standardDouble_divs(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+static void standardFloat_divs(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+double randDouble()
+{
+    return (double)(rand()%100)/(double)(rand()%100) * (rand()%2 ? -1.0 : 1.0);
+}
+
+int near(double a, double b)
+{
+    int aClass = fpclassify(a);
+    int bClass = fpclassify(b);
+
+    if(aClass != bClass)        // i.e. is it NAN, infinite, or finite...?
+        return 0;
+
+    if(aClass == FP_INFINITE)   // if it is infinite, the sign must be the same, i.e.
positive infinity is not near negative infinity + return (signbit(a) == signbit(b)); + else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers + return fabs(a - b) < 0.001; + else // if both number are of the same class as each other and are of any other class (i.e. such as NAN), then they are near to each other. + return 1; +} + + +//-------------------------------------------------------------------------------------------------- +// Standard tests: +//-------------------------------------------------------------------------------------------------- +void test_THDoubleVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + double yVal0 = 17.2; + double yVal1 = 8.2; + double yVal2 = 5.1; + double yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THDoubleVector_fill_VSX() 
test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + + +void test_THFloatVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + float yVal0 = 17.2; + float yVal1 = 8.2; + float yVal2 = 5.1; + float yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + + +void test_THDoubleVector_cadd_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < 
VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cadd_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cadd_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + 
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cadd_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_adds_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + x[i] = randDouble(); + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_adds() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = 
(double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_adds_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THFloatVector_adds_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + x[i] = (float)randDouble(); + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_adds() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + 
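// The shifted destination offsets (+1, +2, +3, +517, +517+r) and the shrinking
+    // lengths appear intended to verify that the VSX path matches the scalar
+    // reference for unaligned start addresses and arbitrary tail remainders.
+ 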
THFloatVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_adds_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THDoubleVector_cmul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f 
%f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cmul_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cmul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cmul_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_muls_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double 
*)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_muls_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + +void test_THFloatVector_muls_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + 
end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_muls_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + + +void test_THDoubleVector_cdiv_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / 
CLOCKS_PER_SEC; + printf("THDoubleVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cdiv_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cdiv_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cdiv( 
z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cdiv_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_divs_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_divs() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, 
y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_divs_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + +void test_THFloatVector_divs_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_divs() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_divs_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +//-------------------------------------------------------------------------------------------------- +// Run tests: +//-------------------------------------------------------------------------------------------------- +int main() +{ + printf("\n"); + + + // First test utility functions + + assert(!near(0.1, -0.1)); + assert(!near(0.1f, -0.1f)); + assert(!near(9, 10)); + assert(near(0.1, 0.1000001)); + assert(near(0.1f, 
0.1000001f)); + assert(near(100.764, 100.764)); + assert(!near(NAN, 0.0)); + assert(!near(-9.5, NAN)); + assert(!near(NAN, 100)); + assert(!near(-0.0, NAN)); + assert(near(NAN, NAN)); + assert(near(INFINITY, INFINITY)); + assert(near(-INFINITY, -INFINITY)); + assert(!near(INFINITY, NAN)); + assert(!near(0, INFINITY)); + assert(!near(-999.4324, INFINITY)); + assert(!near(INFINITY, 982374.1)); + assert(!near(-INFINITY, INFINITY)); + + + + // Then test each vectorized function + + test_THDoubleVector_fill_VSX(); + test_THFloatVector_fill_VSX(); + + test_THDoubleVector_cadd_VSX(); + test_THFloatVector_cadd_VSX(); + + test_THDoubleVector_adds_VSX(); + test_THFloatVector_adds_VSX(); + + test_THDoubleVector_cmul_VSX(); + test_THFloatVector_cmul_VSX(); + + test_THDoubleVector_muls_VSX(); + test_THFloatVector_muls_VSX(); + + test_THDoubleVector_cdiv_VSX(); + test_THFloatVector_cdiv_VSX(); + + test_THDoubleVector_divs_VSX(); + test_THFloatVector_divs_VSX(); + + + + printf("Finished running all tests. All tests PASSED.\n"); + return 0; +} + + +#endif // defined RUN_VSX_TESTS + +#endif // defined __PPC64__ + diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt new file mode 100644 index 0000000..ac445f7 --- /dev/null +++ b/aten/src/THC/CMakeLists.txt @@ -0,0 +1,165 @@ +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" +PARENT_SCOPE) + +CONFIGURE_FILE(THCGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h") + +set(extra_src) +# loop over all types +foreach(THC_TYPE Byte Char Short Int Long Half Float Double) + # loop over files which need to be split between types (because of long compile times) + foreach(THC_FILE TensorSort TensorMathCompareT TensorMathPointwise TensorMathCompare TensorMathReduce TensorMasked) + if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu") + FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu" + "#include \"../THC${THC_FILE}.cuh\"\n#include \"THCTensor.hpp\"\n#include \"../generic/THC${THC_FILE}.cu\"\n#include \"../THCGenerate${THC_TYPE}Type.h\"\n") + endif() + LIST(APPEND extra_src "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu") + endforeach() +endforeach() + +IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/THCHalf.cu) +ENDIF() + +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingHostAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCGeneral.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCStream.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCThreadLocal.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/THCReduceApplyUtils.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCBlas.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCSleep.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorage.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathBlas.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathMagma.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathPairwise.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathReduce.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathScan.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu + 
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu + ${extra_src} + PARENT_SCOPE) + +INSTALL(FILES + THC.h + ${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h + THCGeneral.hpp + THCBlas.h + THCSleep.h + THCStorage.h + THCStorageCopy.h + THCStream.h + THCThreadLocal.h + THCTensor.h + THCTensorCopy.h + THCTensorCopy.hpp + THCTensorRandom.h + THCTensorMath.h + THCApply.cuh + THCReduce.cuh + THCReduceAll.cuh + THCReduceApplyUtils.cuh + THCTensorMathReduce.cuh + THCAsmUtils.cuh + THCAtomics.cuh + THCScanUtils.cuh + THCSortUtils.cuh + THCAllocator.h + THCCachingAllocator.h + THCCachingHostAllocator.h + THCDeviceUtils.cuh + THCDeviceTensor.cuh + THCDeviceTensor-inl.cuh + THCDeviceTensorUtils.cuh + THCDeviceTensorUtils-inl.cuh + THCGenerateAllTypes.h + THCGenerateByteType.h + THCGenerateCharType.h + THCGenerateShortType.h + THCGenerateIntType.h + THCGenerateLongType.h + THCGenerateHalfType.h + THCGenerateFloatType.h + THCGenerateFloatTypes.h + THCGenerateDoubleType.h + THCHalf.h + THCIntegerDivider.cuh + THCNumerics.cuh + THCTensorSort.cuh + THCTensorInfo.cuh + THCTensorMathPointwise.cuh + THCTensorTypeUtils.cuh + THCTensorRandom.cuh + THCTensorMathMagma.cuh + THCThrustAllocator.cuh + THCTensorMode.cuh + THCTensorTopK.cuh + THCCachingAllocator.h + # See Note [TH abstraction violation] + THCGenerator.hpp + THCTensor.hpp + THCStorage.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC") + +INSTALL(FILES + generic/THCStorage.cpp + generic/THCStorage.cu + generic/THCStorage.h + generic/THCTensor.cpp + generic/THCTensor.cu + generic/THCTensor.h + generic/THCStorageCopy.cpp + generic/THCStorageCopy.cu + generic/THCStorageCopy.h + generic/THCTensorCopy.cpp + generic/THCTensorCopy.cu + generic/THCTensorCopy.h + generic/THCTensorMasked.h + generic/THCTensorMasked.cu + generic/THCTensorMath.h + generic/THCTensorMath.cu + generic/THCTensorMathBlas.cu + generic/THCTensorMathBlas.h + generic/THCTensorMathCompare.h + generic/THCTensorMathCompare.cu + generic/THCTensorMathCompareT.h + generic/THCTensorMathCompareT.cu + generic/THCTensorMathMagma.h + generic/THCTensorMathMagma.cu + generic/THCTensorMathPairwise.h + generic/THCTensorMathPairwise.cu + generic/THCTensorMathPointwise.h + generic/THCTensorMathPointwise.cu + generic/THCTensorMathReduce.h + generic/THCTensorMathReduce.cu + generic/THCTensorMathScan.h + generic/THCTensorMathScan.cu + generic/THCTensorScatterGather.h + generic/THCTensorScatterGather.cu + generic/THCTensorIndex.h + generic/THCTensorIndex.cu + generic/THCTensorSort.h + generic/THCTensorSort.cu + generic/THCTensorRandom.h + generic/THCTensorRandom.cu + generic/THCTensorMode.h + generic/THCTensorMode.cu + generic/THCTensorTopK.h + generic/THCTensorTopK.cu + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/aten/src/THC/THC.h b/aten/src/THC/THC.h new file mode 100644 index 0000000..e333d8a --- /dev/null +++ b/aten/src/THC/THC.h @@ -0,0 +1,18 @@ +#ifndef THC_INC +#define THC_INC + +#include "THCGeneral.h" +#include "THCAllocator.h" +#include "THCBlas.h" +#include "THCCachingAllocator.h" +#include "THCCachingHostAllocator.h" +#include "THCSleep.h" +#include "THCStorage.h" +#include "THCStorageCopy.h" +#include "THCStream.h" +#include "THCTensor.h" +#include "THCTensorCopy.h" +#include "THCTensorRandom.h" +#include "THCTensorMath.h" + +#endif diff 
--git a/aten/src/THC/THCAllocator.cpp b/aten/src/THC/THCAllocator.cpp new file mode 100644 index 0000000..c6be2f0 --- /dev/null +++ b/aten/src/THC/THCAllocator.cpp @@ -0,0 +1,68 @@ +#include "THCAllocator.h" + +static void THCudaHostDeleter(void* ptr) { + THCudaCheck(cudaFreeHost(ptr)); +} + +struct THCudaHostAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* ptr = nullptr; + if (size != 0) { + THCudaCheck(cudaMallocHost(&ptr, size)); + } + return {ptr, ptr, &THCudaHostDeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCudaHostDeleter; + } +}; + +static THCudaHostAllocator th_cuda_host_allocator; +at::Allocator* getTHCudaHostAllocator() { + return &th_cuda_host_allocator; +} + +static void THCUVADeleter(void* ptr) { + THCudaCheck(cudaFree(ptr)); +} + +struct THCUVAAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + // See J.1.1 of the CUDA_C_Programming_Guide.pdf for UVA and coherence rules + // on various compute capabilities. + void* ptr = nullptr; + if (size != 0) { + THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)); + } + return {ptr, ptr, &THCUVADeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCUVADeleter; + } +}; + +static THCUVAAllocator thc_uva_allocator; +at::Allocator* getTHCUVAAllocator() { + return &thc_uva_allocator; +} + + +THCIpcDeleter::~THCIpcDeleter() { + int prev_device; + THCudaCheck(cudaGetDevice(&prev_device)); + THCudaCheck(cudaSetDevice(device_)); + THCudaCheck(cudaIpcCloseMemHandle(data_)); + THCudaCheck(cudaSetDevice(prev_device)); +} + +void deleteTHCIpcDeleter(void* ptr) { + delete static_cast(ptr); +} + +at::DataPtr THCIpcDeleter::makeDataPtr(void* data, int device) { + // The dynamic allocation here is a bit unfortunate + int cur_device; + THCudaCheck(cudaGetDevice(&cur_device)); + auto* context = new THCIpcDeleter(data, device); + return {data, context, &deleteTHCIpcDeleter, at::Device(at::kCUDA, cur_device)}; +} diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h new file mode 100644 index 0000000..652bb7a --- /dev/null +++ b/aten/src/THC/THCAllocator.h @@ -0,0 +1,22 @@ +#ifndef THC_ALLOCATOR_INC +#define THC_ALLOCATOR_INC + +#include "THCGeneral.h" + +THC_API THAllocator* getTHCudaHostAllocator(void); +THC_API THAllocator* getTHCUVAAllocator(void); +// IPC doesn't support (re)allocation + +#ifdef __cplusplus +class AT_API THCIpcDeleter { +public: + THCIpcDeleter(void* data, int device) : data_(data), device_(device) {}; + ~THCIpcDeleter(); + static at::DataPtr makeDataPtr(void* data, int device); +private: + void* data_; + int device_; +}; +#endif + +#endif diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh new file mode 100644 index 0000000..d456b53 --- /dev/null +++ b/aten/src/THC/THCApply.cuh @@ -0,0 +1,748 @@ +#ifndef THC_APPLY_INC +#define THC_APPLY_INC + +#include "THCTensorCopy.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCTensorCopy.hpp" + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. +// + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. 
+// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=3) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). +template +void rearrangeDims(TensorInfo* aInfo, + TensorInfo* bInfo = nullptr, + TensorInfo* cInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[3] = { aInfo->sizes, }; + IndexType *strides[3] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. + bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// Threads per block for our apply kernel +// FIXME: use occupancy calculator instead +#define THC_APPLY_THREADS_PER_BLOCK (32 * 16) +#define THC_APPLY_BLOCKS_PER_SM 4 +template +__global__ void +kernelPointwiseApply1(const OffsetInfo a, + IndexType totalElements, + Op op) { + // NOTE: The two typecasts below are essential when IndexType is 64-bit; + // without them, results are silently truncated to 32 bits! 
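+  // (blockIdx.x, blockDim.x and threadIdx.x are 32-bit unsigned values, so
+  // without the explicit cast the index arithmetic is done in 32-bit math and
+  // wraps past UINT32_MAX, even though linearIndex itself is 64 bits wide.)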
+ for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex)); + } +} + +template +__global__ void +kernelPointwiseApply2(const OffsetInfo a, + const OffsetInfo b, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex), b.get(linearIndex)); + } +} + +template +__global__ void +kernelPointwiseApply3(const OffsetInfo a, + const OffsetInfo b, + const OffsetInfo c, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex), b.get(linearIndex), c.get(linearIndex)); + } +} + +inline dim3 getApplyBlock() { + return dim3(THC_APPLY_THREADS_PER_BLOCK); +} + +inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) { + if (curDevice == -1) return false; + + uint64_t numBlocks = THCCeilDiv(totalElements, static_cast(THC_APPLY_THREADS_PER_BLOCK)); + uint64_t maxGridX = THCState_getDeviceProperties(state, curDevice)->maxGridSize[0]; + if (numBlocks > maxGridX) + numBlocks = maxGridX; + + // For 32-bit indices, make sure that gridDim.x * blockDim.x fits in 32 bits. + if (totalElements <= INT32_MAX && + numBlocks > INT32_MAX / THC_APPLY_THREADS_PER_BLOCK) + numBlocks = INT32_MAX / THC_APPLY_THREADS_PER_BLOCK; + + grid = dim3(numBlocks); + return true; +} + +template +bool THC_pointwiseApply1(THCState* state, + TensorTypeA* a, + const Op& op, + TensorArgType aType = ReadWrite) { + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorTypeA* oldA = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
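+  // In the macros below, TYPE is the index type and A is the number of
+  // collapsed tensor dimensions to specialize the kernel for: 1 and 2 get
+  // dedicated instantiations, while -1 selects the generic runtime-dims case.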
+#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_A_CASE(TYPE, A) { \ + switch (A) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ +} + + // Can we use 32-bit integer math in the kernel (the linear ID for the copy + // and the resulting non-linear offset is all computable using 32-bit math?) + // We also use unsigned index math in the kernel, as signed div/mod has + // additional overhead. + if (THCTensor_canUse32BitIndexMath(state, a)) { + TensorInfo aInfo = + getTensorInfo(state, a); + rearrangeDims(&aInfo); + aInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!aInfo.isContiguous()) { + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); + } +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + rearrangeDims(&aInfo); + aInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + kernelPointwiseApply1 + <<>>( + aOffset, (uint64_t) totalElements, op); + } else { + +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + OffsetInfo + aOffset(aInfo); + kernelPointwiseApply1 + <<>>( + aOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + return true; +} + +template +bool THC_pointwiseApply2(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + if (totalElements != THCTensor_nElement(state, b)) { + return false; + } + + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, b) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. 
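+    (Indices "overlap" when more than one logical position maps to the same
+    storage element, e.g. a view expanded along a stride-0 dimension, so an
+    in-place op could otherwise touch the same element twice.)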
+ */ + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + if (bType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, b)) { + // Must perform in contiguous space + oldB = b; + b = (TensorTypeB*)THCTensor_newContiguous(state, b); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + OffsetInfo \ + (bInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b)) { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous())) + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + kernelPointwiseApply2 + <<>>( + aOffset, bOffset, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + kernelPointwiseApply2 + <<>>( + aOffset, bOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + if (oldB) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. 
+ THCTensor_copyIgnoringOverlaps(state, oldB, b); + THCTensor_free(state, b); + b = oldB; + } + + return true; +} + +template +bool THC_pointwiseApply3(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + TensorTypeC* c, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly, + TensorArgType cType = ReadOnly) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + if (totalElements != THCTensor_nElement(state, b) || + totalElements != THCTensor_nElement(state, c)) { + return false; + } + + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, b) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, c) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + TensorTypeC* oldC = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + if (bType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, b)) { + // Must perform in contiguous space + oldB = b; + b = (TensorTypeB*)THCTensor_newContiguous(state, b); + } + if (cType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, c)) { + // Must perform in contiguous space + oldC = c; + c = (TensorTypeC*)THCTensor_newContiguous(state, c); + } + +#define HANDLE_CASE(TYPE, A, B, C) \ + kernelPointwiseApply3 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + OffsetInfo \ + (bInfo), \ + OffsetInfo \ + (cInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_C_CASE(TYPE, A, B, C) { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ +} + + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b) && + THCTensor_canUse32BitIndexMath(state, c)) { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + TensorInfo cInfo = + getTensorInfo(state, c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + 
TensorInfo cInfo = + getTensorInfo(state, c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + OffsetInfo + cOffset(cInfo); + kernelPointwiseApply3 + <<>>( + aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + OffsetInfo + cOffset(cInfo); + kernelPointwiseApply3 + <<>>( + aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + if (oldB) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + THCTensor_copyIgnoringOverlaps(state, oldB, b); + THCTensor_free(state, b); + b = oldB; + } + + if (oldC) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + THCTensor_copyIgnoringOverlaps(state, oldC, c); + THCTensor_free(state, c); + c = oldC; + } + + return true; +} + +#undef THC_APPLY_THREADS_PER_BLOCK +#undef THC_APPLY_BLOCKS_PER_SM + +#endif // THC_APPLY_INC diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh new file mode 100644 index 0000000..c419ffa --- /dev/null +++ b/aten/src/THC/THCAsmUtils.cuh @@ -0,0 +1,142 @@ +#ifndef THC_ASM_UTILS_INC +#define THC_ASM_UTILS_INC + +// Collection of direct PTX functions + +template +struct Bitfield {}; + +template <> +struct Bitfield { + static __device__ __forceinline__ + unsigned int getBitfield(unsigned int val, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + unsigned int m = (1u << len) - 1u; + m <<= pos; + return val & m; +#else + unsigned int ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); + return ret; +#endif + } + + static __device__ __forceinline__ + unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + unsigned int m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; +#else + unsigned int ret; + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); + return ret; +#endif + } +}; + +template <> +struct Bitfield { + static __device__ __forceinline__ + uint64_t getBitfield(uint64_t val, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + uint64_t m = (1u << len) - 1u; + m <<= pos; + return val & m; +#else + uint64_t ret; + asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); + return ret; +#endif + } + + static __device__ __forceinline__ + uint64_t setBitfield(uint64_t val, 
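bfe.u32 extracts len bits starting at bit pos and right-aligns them; bfi.b32 inserts a field the same way. A portable C++ reference for what those PTX instructions compute (illustration only, not the inline-asm path used above):

```
#include <cstdio>

// Portable equivalents of a bitfield extract/insert:
// extract returns bits [pos, pos+len) of val, right-aligned;
// insert overwrites bits [pos, pos+len) of val with the low bits of ins.
unsigned int getBitfieldRef(unsigned int val, int pos, int len) {
  unsigned int mask = (len >= 32) ? 0xffffffffu : ((1u << len) - 1u);
  return (val >> pos) & mask;
}

unsigned int setBitfieldRef(unsigned int val, unsigned int ins, int pos, int len) {
  unsigned int mask = (len >= 32) ? 0xffffffffu : ((1u << len) - 1u);
  return (val & ~(mask << pos)) | ((ins & mask) << pos);
}

int main() {
  unsigned int v = 0xABCD1234u;
  std::printf("extract bits [8,16): 0x%x\n", getBitfieldRef(v, 8, 8));     // 0x12
  std::printf("insert 0xFF there:   0x%x\n", setBitfieldRef(v, 0xFF, 8, 8)); // 0xabcdff34
  return 0;
}
```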
uint64_t toInsert, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + uint64_t m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; +#else + uint64_t ret; + asm("bfi.b64 %0, %1, %2, %3, %4;" : + "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len)); + return ret; +#endif + } +}; + +__device__ __forceinline__ int getLaneId() { +#if defined(__HIP_PLATFORM_HCC__) + return hc::__lane_id(); +#else + int laneId; + asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); + return laneId; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskLt() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = (1ull << getLaneId()) - 1ull; + return m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskLe() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = (1ull << (getLaneId() + 1ull)) - 1ull; + return m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskGt() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = getLaneMaskLe(); + return m ? ~m : m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskGe() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = getLaneMaskLt(); + return ~m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); + return mask; +#endif +} + + +#endif // THC_ASM_UTILS_INC diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh new file mode 100644 index 0000000..bdb7859 --- /dev/null +++ b/aten/src/THC/THCAtomics.cuh @@ -0,0 +1,148 @@ +#ifndef THC_ATOMICS_INC +#define THC_ATOMICS_INC + +#include "THC.h" +#include "THCHalf.h" +#include "THCNumerics.cuh" + +namespace at { struct Half; } + +template +struct AtomicAddIntegerImpl; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = + (uint32_t *) (address - ((size_t)address & 3)); + uint32_t old = *address_as_ui; + uint32_t shift = (((size_t)address & 3) * 8); + uint32_t sum; + uint32_t assumed; + + do { + assumed = old; + sum = val + T((old >> shift) & 0xff); + old = (old & ~(0x000000ff << shift)) | (sum << shift); + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = + (uint32_t *) ((char *)address - ((size_t)address & 2)); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + + do { + assumed = old; + sum = val + (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff); + newval = (size_t)address & 2 ? 
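All of the integer atomicAdd specializations below share one shape: load the containing word, compute the updated word, and atomicCAS it back, retrying until no other thread raced in between. A host-side analogue of that retry loop using std::atomic (a sketch of the pattern, not the device code):

```
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Emulate "atomicAdd on a type with no native RMW support" by looping on
// compare-exchange until the update lands without a race.
void casAdd(std::atomic<uint32_t>& word, uint32_t val) {
  uint32_t old = word.load();
  uint32_t desired;
  do {
    desired = old + val;                                  // compute updated word
  } while (!word.compare_exchange_weak(old, desired));    // retry if another thread won
}

int main() {
  std::atomic<uint32_t> counter{0};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&] { for (int i = 0; i < 1000; ++i) casAdd(counter, 1); });
  for (auto& w : workers) w.join();
  std::printf("counter = %u (expected 4000)\n", counter.load());
  return 0;
}
```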
(old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = (uint32_t *) (address); + uint32_t old = *address_as_ui; + uint32_t newval; + uint32_t assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + unsigned long long * address_as_ui = (unsigned long long *) (address); + unsigned long long old = *address_as_ui; + unsigned long long newval; + unsigned long long assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int8_t *address, int8_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int16_t *address, int16_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int64_t *address, int64_t val) { + AtomicAddIntegerImpl()(address, val); +} + +#ifdef CUDA_HALF_TENSOR +static inline __device__ void atomicAdd(half *address, half val) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics::add(hsum, val); +#else + __half_raw hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + half tmpres = THCNumerics::add(hsum, val); + hsum = __half_raw(tmpres); +#endif + old = (size_t)address & 2 ? 
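For 16-bit types the update has to be folded into the aligned 32-bit word that contains the address, and (size_t)address & 2 selects which half-word is being touched. A host-side sketch of just that packing arithmetic, assuming the same little-endian layout the device code relies on:

```
#include <cstdint>
#include <cstdio>
#include <cstring>

// Replace the 16-bit lane of a 32-bit word selected by the low address bits,
// mirroring how the device code rebuilds the word before atomicCAS.
uint32_t replaceHalfword(uint32_t word, uint16_t newHalf, bool upperLane) {
  return upperLane ? (word & 0x0000ffffu) | (uint32_t(newHalf) << 16)
                   : (word & 0xffff0000u) | uint32_t(newHalf);
}

int main() {
  alignas(4) uint16_t pair[2] = {0x1111, 0x2222};  // two halves of one aligned word
  uint32_t word;
  std::memcpy(&word, pair, sizeof(word));

  // Updating pair[1]: its address has bit 1 set, so it is the upper lane
  // of the containing 32-bit word on a little-endian machine.
  bool upper = (reinterpret_cast<uintptr_t>(&pair[1]) & 2) != 0;
  word = replaceHalfword(word, 0xBEEF, upper);

  std::memcpy(pair, &word, sizeof(word));
  std::printf("pair = {0x%04x, 0x%04x}\n", (unsigned)pair[0], (unsigned)pair[1]);
  return 0;
}
```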
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); +} +static inline __device__ void atomicAdd(at::Half *address, half val) { + return atomicAdd(reinterpret_cast(address), val); +} +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) +// from CUDA C Programmic Guide +static inline __device__ void atomicAdd(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +} while (assumed != old); +} +#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__) + // This needs to be defined for the host side pass + static inline __device__ void atomicAdd(double *address, double val) { } +#endif + +#endif // THC_ATOMICS_INC diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu new file mode 100644 index 0000000..e2003da --- /dev/null +++ b/aten/src/THC/THCBlas.cu @@ -0,0 +1,539 @@ +#include "THCBlas.h" +#include "THCGeneral.h" +#include "THCHalf.h" + +float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy) +{ + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + float result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSdot(handle, i_n, x, i_incx, y, i_incy, &result)); + return result; + } + + THError("Cublas_Sdot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; +} + +double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy) +{ + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + double result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDdot(handle, i_n, x, i_incx, y, i_incy, &result)); + return result; + } + + THError("Cublas_Ddot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; +} + +#ifdef CUDA_HALF_TENSOR +half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +{ +#if CUDA_VERSION >= 8000 + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + half result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDotEx(handle, n, + x, CUDA_R_16F, incx, + y, CUDA_R_16F, incy, + &result, CUDA_R_16F, + CUDA_R_32F)); + return result; + } + + THError("Cublas_Hdot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return THC_float2half(0); +#else + THError("Cublas_Hdot requires CUDA 8.0+"); + return THC_float2half(0); +#endif +} +#endif + +/* Level 2 */ +void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t 
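The BLAS wrappers accept 64-bit sizes, but the classic cuBLAS entry points take int, so every call is guarded by an INT_MAX check before narrowing. The same narrowing check as a standalone sketch (narrowToInt is a hypothetical helper, not part of this patch):

```
#include <climits>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

// Narrow an int64_t BLAS dimension to int, refusing values the 32-bit
// cuBLAS interfaces cannot represent; the wrappers above do the same
// check and raise a THError instead of throwing.
int narrowToInt(int64_t v, const char* name) {
  if (v > INT_MAX) {
    throw std::runtime_error(std::string(name) + " exceeds signed int range");
  }
  return static_cast<int>(v);
}

int main() {
  std::printf("n narrows to %d\n", narrowToInt(int64_t(1) << 20, "n"));
  try {
    narrowToInt(int64_t(1) << 40, "n");
  } catch (const std::exception& e) {
    std::printf("rejected: %s\n", e.what());
  }
  return 0;
}
```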
lda, float *x, int64_t incx, float beta, float *y, int64_t incy) +{ + if(n == 1) + lda = m; + + cublasOperation_t op; + if (trans == 't') op = CUBLAS_OP_T; + else if (trans == 'n') op = CUBLAS_OP_N; + else if (trans == 'c') op = CUBLAS_OP_C; + else THError("Cublas_Sgemv parameter trans should be 't', 'n' or 'c'."); + + if( (m <= INT_MAX) && (n <= INT_MAX) && + (lda > 0) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + return; + } + THError("Cublas_Sgemv only supports m, n, lda, incx, incy" + "in the range 0 < [val] <= %d", INT_MAX); +} + +void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy) +{ + if(n == 1) + lda = m; + + cublasOperation_t op; + if (trans == 't') op = CUBLAS_OP_T; + else if (trans == 'n') op = CUBLAS_OP_N; + else if (trans == 'c') op = CUBLAS_OP_C; + else THError("Cublas_Sgemv parameter trans should be 't', 'n' or 'c'."); + + if( (m <= INT_MAX) && (n <= INT_MAX) && + (lda > 0) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + return; + } + THError("Cublas_Dgemv only supports m, n, lda, incx, incy" + "in the range 0 < [val] <= %d", INT_MAX); +} + +void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) +{ + if(n == 1) + lda = m; + + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); + return; + } + THError("Cublas_Sger only supports m, n, lda, incx, incy" + "with the bound [val] <= %d", INT_MAX); +} + +void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) +{ + if(n == 1) + lda = m; + + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); + return; + } + THError("Cublas_Dger only supports m, n, lda, incx, incy" + "with the bound [val] <= %d", INT_MAX); +} + + +cublasOperation_t convertTransToCublasOperation(char trans) { + if (trans == 't') return 
CUBLAS_OP_T; + else if (trans == 'n') return CUBLAS_OP_N; + else if (trans == 'c') return CUBLAS_OP_C; + else { + THError("trans must be one of: t, n, c"); + return CUBLAS_OP_T; + } +} + +void adjustLd(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) +{ + int transa_ = ((transa == 't') || (transa == 'T')); + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + *ldc = m; + + if(transa_) + { + if(m == 1) + *lda = k; + } + else + { + if(k == 1) + *lda = m; + } + + if(transb_) + { + if(k == 1) + *ldb = n; + } + else + { + if(n == 1) + *ldb = k; + } +} + +/* Level 3 */ +void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); + return; + } + THError("Cublas_Sgemm only supports m, n, k, lda, ldb, ldc" + "with the bound [val] <= %d", INT_MAX); +} + +#ifdef CUDA_HALF_TENSOR +// In CUDA 8.0, definition of data types for sgemmex changed +#if CUDA_VERSION < 8000 +# define CUDA_R_16F CUBLAS_DATA_HALF +#endif + +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + + // Simulated Hgemm + float fAlpha = THC_half2float(alpha); + float fBeta = THC_half2float(beta); + +#if CUDA_VERSION < 9000 + THCublasCheck(cublasSgemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); +#else + cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); + if (prop->major >= 5){ + THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + THCublasCheck(cublasGemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc, + CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); + }else{ + THCublasCheck(cublasSgemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); + } +#endif + return; + } + THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc" + "with th bound [val] <= %d", 
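adjustLd patches the leading dimensions for degenerate shapes (a single row or column in column-major layout), where callers commonly pass ld = 1 but cuBLAS still requires ld to cover the column height. A host-side sketch exercising the same rule:

```
#include <cstdint>
#include <cstdio>

// Same rule as adjustLd above: when an operand degenerates to a single
// column (column-major), its leading dimension must still be at least the
// column height, so fix it up on the caller's behalf.
void adjustLdSketch(char transa, char transb, int64_t m, int64_t n, int64_t k,
                    int64_t* lda, int64_t* ldb, int64_t* ldc) {
  bool ta = (transa == 't' || transa == 'T');
  bool tb = (transb == 't' || transb == 'T');
  if (n == 1) *ldc = m;
  if (ta) { if (m == 1) *lda = k; } else { if (k == 1) *lda = m; }
  if (tb) { if (k == 1) *ldb = n; } else { if (n == 1) *ldb = k; }
}

int main() {
  // C(m x 1) = A(m x k) * B(k x 1): callers often hand in ldb = ldc = 1.
  int64_t m = 8, n = 1, k = 4, lda = 8, ldb = 1, ldc = 1;
  adjustLdSketch('n', 'n', m, n, k, &lda, &ldb, &ldc);
  std::printf("lda=%lld ldb=%lld ldc=%lld\n",
              (long long)lda, (long long)ldb, (long long)ldc);  // 8 4 8
  return 0;
}
```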
INT_MAX); +} +#endif + +void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); + return; + } + THError("Cublas_Dgemm only supports m, n, k, lda, ldb, ldc" + "with the bound [val] <= %d", INT_MAX); +} + +#if CUDA_VERSION >= 9010 +void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, + half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + float fAlpha = THC_half2float(alpha); + float fBeta = THC_half2float(beta); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16F, (int)lda, strideA, + b, CUDA_R_16F, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16F, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); +} +#endif + +void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, + float beta, float *c[], int64_t ldc, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_SgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemmBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc, + (int)batchCount)); +} + +#if CUDA_VERSION >= 8000 +void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char 
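The strided-batched entry points describe a whole batch with one base pointer and one stride per operand: batch i reads a + i*strideA and b + i*strideB and writes c + i*strideC, with no array of device pointers. A small host-side sketch of that addressing on hypothetical 2x2 matrices:

```
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical batch of 3 GEMMs with 2x2 matrices stored back to back.
  const int batch = 3, m = 2, n = 2, k = 2;
  const long strideA = m * k, strideB = k * n, strideC = m * n;

  std::vector<float> A(batch * strideA, 1.0f);
  std::vector<float> B(batch * strideB, 2.0f);
  std::vector<float> C(batch * strideC, 0.0f);

  // This is the addressing a strided-batched GEMM performs:
  // operand i lives at base + i * stride.
  for (int i = 0; i < batch; ++i) {
    const float* a = A.data() + i * strideA;
    const float* b = B.data() + i * strideB;
    float*       c = C.data() + i * strideC;
    for (int r = 0; r < m; ++r)
      for (int col = 0; col < n; ++col) {
        float acc = 0.0f;
        for (int kk = 0; kk < k; ++kk) acc += a[r + kk * m] * b[kk + col * k];
        c[r + col * m] = acc;  // column-major, like cuBLAS
      }
  }
  std::printf("C[0](0,0) = %.1f (expected 4.0)\n", C[0]);
  return 0;
}
```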
transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, + float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemmStridedBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, + (int)batchCount)); +} +#endif + +void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, + double beta, double *c[], int64_t ldc, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemmBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc, + (int)batchCount)); +} + +#if CUDA_VERSION >= 8000 +void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, + double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemmStridedBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, + (int)batchCount)); +} +#endif + +/* Inverse */ +void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { + if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Sgetrf only supports n, lda, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, 
THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +} + +void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { + if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrf only supports n, lda, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +} + +THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) +{ + if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + + // no need to adjust leading dimensions, since matrices are square + cublasOperation_t opa = convertTransToCublasOperation(transa); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +} + + +THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) +{ + if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + + // no need to adjust leading dimensions, since matrices are square + cublasOperation_t opa = convertTransToCublasOperation(transa); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +} + +void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { + + if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +} + +void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { + + if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +} diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h new file mode 100644 index 0000000..d9cff32 --- /dev/null +++ b/aten/src/THC/THCBlas.h @@ -0,0 +1,59 @@ +#ifndef THC_BLAS_INC +#define THC_BLAS_INC + +#include "THCGeneral.h" +#include "THCHalf.h" + +/* Level 1 */ +THC_API float 
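getrf/getrs/getri form the usual batched LU workflow: factor each matrix in place, then either solve right-hand sides or build the explicit inverse. As a tiny illustration of the packed layout getrf produces, here is an in-place LU on one small column-major matrix (host-side, no pivoting, purely illustrative):

```
#include <cstdio>

// In-place LU without pivoting on a small column-major matrix: after the
// loop, U occupies the upper triangle and the unit-diagonal L occupies the
// strict lower triangle, which is the packed layout the batched getrf
// routines write back (plus a pivot array).
void luInPlace(float* a, int n, int lda) {
  for (int k = 0; k < n; ++k) {
    for (int i = k + 1; i < n; ++i) {
      a[i + k * lda] /= a[k + k * lda];                       // L multiplier
      for (int j = k + 1; j < n; ++j)
        a[i + j * lda] -= a[i + k * lda] * a[k + j * lda];    // update trailing block
    }
  }
}

int main() {
  // Column-major 2x2 matrix [[4, 6], [3, 3]].
  float a[4] = {4, 3, 6, 3};
  luInPlace(a, 2, 2);
  std::printf("U = [[%.2f, %.2f], [0, %.2f]], L21 = %.2f\n",
              a[0], a[2], a[3], a[1]);   // U = [[4, 6], [0, -1.5]], L21 = 0.75
  return 0;
}
```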
THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); +THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); +#ifdef CUDA_HALF_TENSOR +THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +#endif + +/* Level 2 */ +THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); +THC_API void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy); +THC_API void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda); +THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda); + +/* Level 3 */ +THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); +THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); + +#ifdef CUDA_HALF_TENSOR +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +#endif + +THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, + float beta, float *c[], int64_t ldc, int64_t batchCount); +THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, + double beta, double *c[], int64_t ldc, int64_t batchCount); +#if CUDA_VERSION >= 8000 +THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, + float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); +THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, + double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); +#endif + +#if CUDA_VERSION >= 9010 +void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, + half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); +#endif + +/* Inverse */ +THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); +THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize); + +THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int 
ldb, int *info, int batchSize); +THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize); + +THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); +THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize); + +#endif diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp new file mode 100644 index 0000000..7d400a2 --- /dev/null +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -0,0 +1,575 @@ +#include "THCCachingAllocator.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. +// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will free all cached blocks that +// are not split and retry the allocation. +// - Large (>1MB) and small allocation requests are handled separately. Large +// allocation requests can be filled by a cudaMalloc call of the exact size. +// Small requests will allocate and split a 1MB buffer, if necessary. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. 
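A minimal sketch of the "smallest cached block that fits" lookup described in the comment above, built on a size-ordered std::set and lower_bound, which is the same structure the allocator's free lists use (streams and block splitting omitted):

```
#include <cstddef>
#include <cstdio>
#include <set>

struct FreeChunk {
  size_t size;
  void*  ptr;
};

// Order free chunks by size so lower_bound on a size-only key lands on the
// smallest chunk that can satisfy the request (best fit).
struct BySize {
  bool operator()(const FreeChunk& a, const FreeChunk& b) const {
    if (a.size != b.size) return a.size < b.size;
    return a.ptr < b.ptr;
  }
};

int main() {
  std::set<FreeChunk, BySize> freeList = {
    {512, (void*)0x1000}, {2048, (void*)0x2000}, {8192, (void*)0x3000}};

  size_t request = 1000;
  auto it = freeList.lower_bound(FreeChunk{request, nullptr});
  if (it != freeList.end()) {
    std::printf("request %zu -> reuse cached chunk of %zu bytes\n", request, it->size);
    freeList.erase(it);   // hand it out; the real allocator may also split it
  } else {
    std::printf("request %zu -> fall back to cudaMalloc\n", request);
  }
  return 0;
}
```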
+// + +namespace { + +typedef std::shared_ptr THCStreamPtr; +typedef std::set stream_set; + +const size_t kRoundSmall = 512; // round up small allocs to 512 bytes +const size_t kRoundLarge = 131072; // round up large allocs to 128 KiB +const size_t kSmallAlloc = 1048576; // largest "small" allocation is 1 MiB + +struct DeviceStats { + uint64_t amount_allocated; // total amount allocated in bytes + uint64_t max_amount_allocated; // max total amount allocated in bytes + uint64_t amount_cached; // total amount in cache in bytes + uint64_t max_amount_cached; // max total amount in cache in bytes + + DeviceStats() : + amount_allocated(0), max_amount_allocated(0), + amount_cached(0), max_amount_cached(0) { } + + void increaseAllocated(size_t delta) { + amount_allocated += delta; + max_amount_allocated = std::max(max_amount_allocated, amount_allocated); + } + + void decreaseAllocated(size_t delta) { + amount_allocated -= delta; + } + + void increaseCached(size_t delta) { + amount_cached += delta; + max_amount_cached = std::max(max_amount_cached, amount_cached); + } + + void decreaseCached(size_t delta) { + amount_cached -= delta; + } +}; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t size; // block size in bytes + char* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block(int device, cudaStream_t stream, size_t size, char* ptr=NULL) : + device(device), stream(stream), stream_uses(), size(size), ptr(ptr), + allocated(0), prev(NULL), next(NULL), event_count(0) { } +}; + +static bool BlockComparator(const Block* a, const Block* b) +{ + if (a->device != b->device) { + return a->device < b->device; + } + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +} // namespace + +struct THCCachingAllocator +{ + typedef bool (*Comparison)(const Block*, const Block*); + typedef std::set FreeBlocks; + + // device statistics + std::vector device_stats; + + // lock around all operations + std::mutex mutex; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + std::mutex cuda_free_mutex; + + // cached blocks larger than 1 MB + FreeBlocks large_blocks; + + // cached blocks 1 MB or smaller + FreeBlocks small_blocks; + + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // outstanding cuda events + std::deque> cuda_events; + + THCCachingAllocator() : + large_blocks(BlockComparator), + small_blocks(BlockComparator) {} + + DeviceStats &get_stats_for_device(int device) { + THAssert(device >= 0); + if ((size_t) device >= device_stats.size()) { + device_stats.resize(device + 1); + } + return device_stats.at(device); + } + + /** allocates a block which is safe to use from the provided stream */ + cudaError_t malloc(void** devPtr, size_t size, cudaStream_t stream) + { + std::lock_guard lock(mutex); + + int device; + cudaError_t err = cudaGetDevice(&device); + if (err != cudaSuccess) { + return err; + } + + err = process_events(); + if (err != cudaSuccess) { + return err; + } + + size = round_size(size); + bool small = size <= kSmallAlloc; + + DeviceStats &stats = get_stats_for_device(device); + + Block search_key(device, stream, 
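The rounding policy implied by these constants: requests under 512 bytes become 512 bytes, requests up to 1 MiB round up to a multiple of 512 bytes, and larger requests round up to a multiple of 128 KiB. The arithmetic, mirroring the round_size helper defined further down:

```
#include <cstddef>
#include <cstdio>

const size_t kRoundSmall = 512;      // round small allocs to 512 B
const size_t kRoundLarge = 131072;   // round large allocs to 128 KiB
const size_t kSmallAlloc = 1048576;  // "small" means at most 1 MiB

// Same rounding as the allocator's round_size helper.
size_t roundSize(size_t size) {
  if (size < kRoundSmall) return kRoundSmall;
  if (size < kSmallAlloc) return size + kRoundSmall - 1 - (size - 1) % kRoundSmall;
  return size + kRoundLarge - 1 - (size - 1) % kRoundLarge;
}

int main() {
  std::printf("%zu -> %zu\n", (size_t)100,     roundSize(100));      // 512
  std::printf("%zu -> %zu\n", (size_t)1000,    roundSize(1000));     // 1024
  std::printf("%zu -> %zu\n", (size_t)2000000, roundSize(2000000));  // 2097152
  return 0;
}
```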
size); + auto& free_blocks = small ? large_blocks : small_blocks; + + Block* block = NULL; + Block* remaining = NULL; + + auto it = free_blocks.lower_bound(&search_key); + if (it != free_blocks.end() && (*it)->device == device && (*it)->stream == stream) { + block = *it; + free_blocks.erase(it); + } else { + void* ptr; + size_t alloc_size = small ? kSmallAlloc : size; + err = cuda_malloc_retry(device, &ptr, alloc_size); + if (err != cudaSuccess) { + return err; + } + stats.increaseCached(alloc_size); + block = new Block(device, stream, alloc_size, (char*)ptr); + } + + if (block->size - size >= (small ? kRoundSmall : kSmallAlloc + 1)) { + remaining = block; + + block = new Block(device, stream, size, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr += size; + remaining->size -= size; + free_blocks.insert(remaining); + } + + block->allocated = true; + allocated_blocks[block->ptr] = block; + + *devPtr = (void*)block->ptr; + + stats.increaseAllocated(block->size); + return cudaSuccess; + } + + cudaError_t free(void* ptr) + { + std::lock_guard lock(mutex); + if (!ptr) { + return cudaSuccess; + } + + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return cudaErrorInvalidDevicePointer; + } + + Block* block = it->second; + allocated_blocks.erase(it); + block->allocated = false; + + get_stats_for_device(block->device).decreaseAllocated(block->size); + if (!block->stream_uses.empty()) { + return insert_events(block); + } + + free_block(block); + return cudaSuccess; + } + + /** returns cached blocks to the system allocator */ + cudaError_t emptyCache() + { + std::lock_guard lock(mutex); + cudaError_t err = free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); + if (err != cudaSuccess) { + return err; + } + err = free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); + if (err != cudaSuccess) { + return err; + } + return cudaSuccess; + } + + void* getBaseAllocation(void* ptr, size_t* outSize) + { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + THError("invalid device pointer: %p", ptr); + } + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + // Accumulates sizes of all memory blocks for given device in given free list + void cacheInfoAux(FreeBlocks& blocks, int dev_id, size_t* total, size_t* largest) + { + Block search_key(dev_id, 0, 0); + auto it = blocks.lower_bound(&search_key); + for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) { + size_t blocksize = (*it)->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + + void cacheInfo(int dev_id, size_t* total, size_t* largest) + { + std::lock_guard lock(mutex); + cacheInfoAux(large_blocks, dev_id, total, largest); + cacheInfoAux(small_blocks, dev_id, total, largest); + } + + void recordStream(void* ptr, THCStream* stream) + { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + THError("invalid device pointer: %p", ptr); + } + if (THCStream_stream(stream) == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + THCStream_retain(stream); + 
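When a cached block is larger than the request by more than the rounding granularity, it is split: the front piece is handed out and the remainder stays on the free list, with prev/next links preserved so a later free can merge the pieces again. A condensed sketch of that split (plain pointers standing in for the device allocation):

```
#include <cstddef>
#include <cstdio>

// Minimal stand-in for the allocator's Block: a doubly linked chain of
// pieces carved out of one larger segment.
struct Chunk {
  char*  ptr;
  size_t size;
  Chunk* prev;
  Chunk* next;
  bool   allocated;
};

// Carve `want` bytes off the front of `block`; the remainder stays linked
// so a later free can coalesce the pieces back together.
Chunk* split(Chunk* block, size_t want) {
  Chunk* head = new Chunk{block->ptr, want, block->prev, block, true};
  block->ptr  += want;
  block->size -= want;
  block->prev  = head;
  if (head->prev) head->prev->next = head;
  return head;
}

int main() {
  static char segment[1 << 20];   // pretend this came from cudaMalloc
  Chunk* cached = new Chunk{segment, sizeof(segment), nullptr, nullptr, false};
  Chunk* handedOut = split(cached, 4096);
  std::printf("returned %zu bytes, %zu bytes stay cached\n",
              handedOut->size, cached->size);
  delete handedOut;
  delete cached;
  return 0;
}
```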
block->stream_uses.insert(THCStreamPtr(stream, &THCStream_free)); + } + + /** moves a block into the free block list */ + void free_block(Block* block) + { + THAssert(!block->allocated && block->event_count == 0); + bool small = block->size <= kSmallAlloc; + auto& free_blocks = small ? large_blocks : small_blocks; + try_merge_blocks(block, block->prev, free_blocks); + try_merge_blocks(block, block->next, free_blocks); + free_blocks.insert(block); + } + + /** combine previously split blocks */ + void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks) + { + if (!src || src->allocated || src->event_count > 0) { + return; + } + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + dst->size += src->size; + free_blocks.erase(src); + delete src; + } + + size_t round_size(size_t size) + { + if (size < kRoundSmall) { + size = kRoundSmall; + } else if (size < kSmallAlloc) { + size += kRoundSmall - 1 - (size - 1) % kRoundSmall; + } else { + size += kRoundLarge - 1 - (size - 1) % kRoundLarge; + } + return size; + } + + cudaError_t cuda_malloc_retry(int device, void** devPtr, size_t size) + { + // Try cudaMalloc. If cudaMalloc fails, frees all non-split cached blocks + // and retries. + cudaError_t err = cudaMalloc(devPtr, size); + if (err != cudaSuccess) { + cudaGetLastError(); + err = free_cached_blocks(device); + if (err != cudaSuccess) { + return err; + } + err = cudaMalloc(devPtr, size); + if (err != cudaSuccess) { + return err; + } + } + return cudaSuccess; + } + + cudaError_t free_cached_blocks(int device) + { + // Free all non-split cached blocks on device + Block lower_bound(device, NULL, 0); + Block upper_bound(device + 1, NULL, 0); + + cudaError_t err = free_blocks( + large_blocks, + large_blocks.lower_bound(&lower_bound), + large_blocks.lower_bound(&upper_bound)); + if (err != cudaSuccess) { + return err; + } + err = free_blocks( + small_blocks, + small_blocks.lower_bound(&lower_bound), + small_blocks.lower_bound(&upper_bound)); + return err; + } + + cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end) + { + // Frees all non-split blocks between `it` and `end` + std::lock_guard lock(cuda_free_mutex); + while (it != end) { + Block* block = *it; + if (!block->prev && !block->next) { + cudaError_t err = cudaFree((void*)block->ptr); + if (err != cudaSuccess) { + return err; + } + get_stats_for_device(block->device).decreaseCached(block->size); + auto cur = it; + ++it; + blocks.erase(cur); + delete block; + } else { + ++it; + } + } + return cudaSuccess; + } + + Block* find_allocated_block(void *ptr) { + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return NULL; + } + return it->second; + } + + cudaError_t insert_events(Block* block) + { + cudaError_t err; + + int prev_device; + err = cudaGetDevice(&prev_device); + if (err != cudaSuccess) return err; + + std::set streams(std::move(block->stream_uses)); + THAssert(block->stream_uses.empty()); + for (auto it = streams.begin(); it != streams.end(); ++it) { + auto& stream = *it; + + err = cudaSetDevice(THCStream_device(stream.get())); + if (err != cudaSuccess) break; + + cudaEvent_t event; + err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + if (err != cudaSuccess) break; + + err = cudaEventRecord(event, THCStream_stream(stream.get())); + if (err != cudaSuccess) break; + + block->event_count++; 
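cuda_malloc_retry encodes a simple fallback: if the first cudaMalloc fails, return all cached, non-split blocks to the driver and try once more. The same retry shape, sketched with hypothetical tryAllocate/releaseCache stand-ins for the CUDA calls:

```
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-ins for cudaMalloc and free_cached_blocks.
bool tryAllocate(void** out, size_t size, bool pretendFull) {
  if (pretendFull) { *out = nullptr; return false; }
  *out = std::malloc(size);
  return *out != nullptr;
}
void releaseCache(bool* pretendFull) {
  std::printf("releasing cached blocks back to the system\n");
  *pretendFull = false;   // after releasing, the next attempt can succeed
}

// Same shape as cuda_malloc_retry: attempt, release cache on failure, retry.
bool allocWithRetry(void** out, size_t size, bool* pretendFull) {
  if (tryAllocate(out, size, *pretendFull)) return true;
  releaseCache(pretendFull);
  return tryAllocate(out, size, *pretendFull);
}

int main() {
  bool pretendFull = true;   // simulate an out-of-memory first attempt
  void* p = nullptr;
  if (allocWithRetry(&p, 1024, &pretendFull)) {
    std::printf("allocation succeeded on retry\n");
    std::free(p);
  }
  return 0;
}
```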
+ cuda_events.emplace_back(event, block); + } + + cudaSetDevice(prev_device); + return err; + } + + cudaError_t process_events() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + break; + } else if (err != cudaSuccess) { + return err; + } + err = cudaEventDestroy(event); + if (err != cudaSuccess) { + return err; + } + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + return cudaSuccess; + } +}; + +THCCachingAllocator caching_allocator; + +static void CudaCachingDeleter(void* ptr) { + AT_CUDA_CHECK(caching_allocator.free(ptr)); +} + +// NB: I decided not to fold this into THCCachingAllocator, because the latter +// has a lot more methods and it wasn't altogether clear that they should +// actually be publically exposed +struct CudaCachingAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + int device; + THCudaCheck(cudaGetDevice(&device)); + void* r = nullptr; + if (size != 0) { + AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::globalContext().getCurrentCUDAStreamOnDevice(device))); + } + return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + return &CudaCachingDeleter; + } +}; + +CudaCachingAllocator device_allocator; + +THC_API at::Allocator* THCCachingAllocator_get(void) +{ + return &device_allocator; +} + +THC_API void THCCachingAllocator_emptyCache(void) { + AT_CUDA_CHECK(caching_allocator.emptyCache()); +} + +THC_API void THCCachingAllocator_cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + caching_allocator.cacheInfo(dev_id, cachedAndFree, largestBlock); +} + +THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size) +{ + return caching_allocator.getBaseAllocation(ptr, size); +} + +THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream) +{ + caching_allocator.recordStream(ptr, stream); +} + +THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex() +{ + return &caching_allocator.cuda_free_mutex; +} + +static inline void assertValidDevice(int device) { + int device_count; + THCudaCheck(cudaGetDeviceCount(&device_count)); + THAssertMsg(0 <= device && device < device_count, "Invalid device argument."); +} + +THC_API uint64_t THCCachingAllocator_currentMemoryAllocated(int device) +{ + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).amount_allocated; +} + +THC_API uint64_t THCCachingAllocator_maxMemoryAllocated(int device) { + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).max_amount_allocated; +} + +THC_API uint64_t THCCachingAllocator_currentMemoryCached(int device) +{ + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).amount_cached; +} + +THC_API uint64_t THCCachingAllocator_maxMemoryCached(int device) { + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).max_amount_cached; +} diff --git 
a/aten/src/THC/THCCachingAllocator.h b/aten/src/THC/THCCachingAllocator.h new file mode 100644 index 0000000..61314ac --- /dev/null +++ b/aten/src/THC/THCCachingAllocator.h @@ -0,0 +1,25 @@ +#ifndef THC_DEVICE_ALLOCATOR_INC +#define THC_DEVICE_ALLOCATOR_INC + +#if (__cplusplus >= 201103L) || (defined(_MSC_VER) && defined(__cplusplus)) +#include +#endif + +#include "THCGeneral.h" +#include "THCStream.h" + +THC_API THCDeviceAllocator* THCCachingAllocator_get(void); +THC_API void THCCachingAllocator_emptyCache(void); +THC_API void THCCachingAllocator_cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock); +THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size); +THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream); +THC_API uint64_t THCCachingAllocator_currentMemoryAllocated(int device); +THC_API uint64_t THCCachingAllocator_maxMemoryAllocated(int device); +THC_API uint64_t THCCachingAllocator_currentMemoryCached(int device); +THC_API uint64_t THCCachingAllocator_maxMemoryCached(int device); + +#if (__cplusplus >= 201103L) || (defined(_MSC_VER) && defined(__cplusplus)) +THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex(); +#endif + +#endif diff --git a/aten/src/THC/THCCachingHostAllocator.cpp b/aten/src/THC/THCCachingHostAllocator.cpp new file mode 100644 index 0000000..617c6f2 --- /dev/null +++ b/aten/src/THC/THCCachingHostAllocator.cpp @@ -0,0 +1,282 @@ +#include "THCCachingHostAllocator.h" +#include "THCStream.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +typedef std::shared_ptr THCStreamPtr; + +struct BlockSize +{ + size_t size; // allocation size + void* ptr; // host memory pointer + + BlockSize(size_t size, void* ptr=NULL) : size(size), ptr(ptr) {} +}; + +struct Block : public BlockSize +{ + bool allocated; // true if the block is currently allocated + int event_count; // number of outstanding cuda events + std::set streams; + + Block(size_t size, void* ptr, bool allocated) : + BlockSize(size, ptr), allocated(allocated), event_count(0), streams() {} +}; + +static bool BlockComparator(const BlockSize& a, const BlockSize& b) +{ + // sort by size, break ties with pointer + if (a.size != b.size) { + return a.size < b.size; + } + return (uintptr_t)a.ptr < (uintptr_t)b.ptr; +} + +struct HostAllocator +{ + typedef bool (*Comparison)(const BlockSize&, const BlockSize&); + + // lock around all operations + std::mutex mutex; + + // blocks by pointer + std::unordered_map blocks; + + // pointers that are ready to be allocated (event_count=0) + std::set available; + + // outstanding cuda events + std::deque> cuda_events; + + HostAllocator() : available(BlockComparator) {} + + cudaError_t malloc(void** ptr, size_t size) + { + std::lock_guard lock(mutex); + + // process outstanding cuda events which may have occurred + cudaError_t err = processEvents(); + if (err != cudaSuccess) { + return err; + } + + // search for the smallest block which can hold this allocation + BlockSize search_key(size); + auto it = available.lower_bound(search_key); + if (it != available.end()) { + Block& block = blocks.at(it->ptr); + THAssert(!block.allocated && block.event_count == 0); + block.allocated = true; + *ptr = block.ptr; + available.erase(it); + return cudaSuccess; + } + + // note that cudaHostAlloc may not touch pointer if size is 0 + *ptr = 0; + + // allocate a new block if no cached allocation is found + err = cudaHostAlloc(ptr, size, cudaHostAllocDefault); + if (err != cudaSuccess) { + return 
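The host allocator keeps two views of its blocks: a hash map keyed by pointer so free() can find a block in O(1), and a size-ordered set of the currently reusable ones so malloc() can do a best-fit lower_bound. A condensed sketch of that pairing (stream/event bookkeeping omitted, std::malloc standing in for cudaHostAlloc):

```
#include <cstdio>
#include <cstdlib>
#include <set>
#include <unordered_map>

struct HostBlock {
  size_t size;
  void*  ptr;
  bool   inUse;
};

struct SizeOrder {
  bool operator()(const HostBlock* a, const HostBlock* b) const {
    if (a->size != b->size) return a->size < b->size;
    return a->ptr < b->ptr;
  }
};

std::unordered_map<void*, HostBlock*> byPtr;   // every block, keyed by pointer
std::set<HostBlock*, SizeOrder> available;     // only reusable blocks, by size

void* cachedAlloc(size_t size) {
  HostBlock key{size, nullptr, false};
  auto it = available.lower_bound(&key);
  if (it != available.end()) {                 // reuse the smallest fit
    HostBlock* b = *it;
    available.erase(it);
    b->inUse = true;
    return b->ptr;
  }
  void* p = std::malloc(size);                 // stand-in for cudaHostAlloc
  byPtr[p] = new HostBlock{size, p, true};
  return p;
}

void cachedFree(void* p) {
  HostBlock* b = byPtr.at(p);                  // O(1) lookup by pointer
  b->inUse = false;
  available.insert(b);                         // back on the reuse list
}

int main() {
  void* a = cachedAlloc(4096);
  cachedFree(a);
  void* b = cachedAlloc(1024);                 // reuses the 4096-byte block
  std::printf("reused: %s\n", (a == b) ? "yes" : "no");
  return 0;
}
```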
err; + } + + blocks.insert({*ptr, Block(size, *ptr, true)}); + return cudaSuccess; + } + + cudaError_t free(void* ptr) + { + std::lock_guard lock(mutex); + + if (!ptr) { + return cudaSuccess; + } + + // process outstanding cuda events which may have occurred + cudaError_t err = processEvents(); + if (err != cudaSuccess) { + return err; + } + + auto it = blocks.find(ptr); + THAssert(it != blocks.end()); + + Block& block = it->second; + THAssert(block.allocated); + + // free (on valid memory) shouldn't fail, so mark unallocated before + // we process the streams. + block.allocated = false; + + // insert CUDA events for each stream on which this block was used. This + err = insertEvents(block); + if (err != cudaSuccess) { + return err; + } + + if (block.event_count == 0) { + // the block can be re-used if there are no outstanding cuda events + available.insert(block); + } + return cudaSuccess; + } + + cudaError_t recordEvent(void* ptr, THCStream *stream) + { + std::lock_guard lock(mutex); + + auto it = blocks.find(ptr); + if (it == blocks.end()) { + // ignore events for untracked pointers + return cudaSuccess; + } + + Block& block = it->second; + THAssert(block.allocated); + + THCStreamPtr stream_ptr(stream, &THCStream_free); + THCStream_retain(stream); + + block.streams.insert(std::move(stream_ptr)); + return cudaSuccess; + } + + cudaError_t processEvents() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + break; + } else if (err != cudaSuccess) { + return err; + } + err = cudaEventDestroy(event); + if (err != cudaSuccess) { + return err; + } + + Block& block = blocks.at(e.second); + block.event_count--; + if (block.event_count == 0 && !block.allocated) { + available.insert(block); + } + cuda_events.pop_front(); + } + return cudaSuccess; + } + + void emptyCache() + { + std::lock_guard lock(mutex); + + // remove events for freed blocks + for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) { + cudaEvent_t event = it->first; + Block& block = blocks.at(it->second); + if (!block.allocated) { + THCudaCheckWarn(cudaEventDestroy(event)); + block.event_count--; + } + } + + // all cuda_events have been processed + cuda_events.clear(); + + // clear list of available blocks + available.clear(); + + // free and erase non-allocated blocks + for (auto it = blocks.begin(); it != blocks.end();) { + Block& block = it->second; + if (!block.allocated) { + THCudaCheckWarn(cudaFreeHost(block.ptr)); + it = blocks.erase(it); + } else { + ++it; + } + } + } + + cudaError_t insertEvents(Block& block) + { + cudaError_t err; + + int prev_device; + err = cudaGetDevice(&prev_device); + if (err != cudaSuccess) return err; + + std::set streams(std::move(block.streams)); + for (auto it = streams.begin(); it != streams.end(); ++it) { + auto& stream = *it; + + err = cudaSetDevice(THCStream_device(stream.get())); + if (err != cudaSuccess) break; + + cudaEvent_t event; + err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + if (err != cudaSuccess) break; + + err = cudaEventRecord(event, THCStream_stream(stream.get())); + if 
(err != cudaSuccess) break; + + block.event_count++; + cuda_events.emplace_back(event, block.ptr); + } + + cudaSetDevice(prev_device); + return err; + } +}; + +} // namespace + +static HostAllocator allocator; + +cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream) +{ + return allocator.recordEvent(ptr, stream); +} + +void THCCachingHostAllocator_emptyCache() +{ + allocator.emptyCache(); +} + +static void THCCachingHostDeleter(void* ptr) { + allocator.free(ptr); +} + +struct THCCachingHostAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + THAssert(size >= 0); + void *ptr; + THCudaCheck(allocator.malloc(&ptr, size)); + return {ptr, ptr, &THCCachingHostDeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCCachingHostDeleter; + } +}; + +static THCCachingHostAllocator thc_caching_host_allocator; +at::Allocator* getTHCCachingHostAllocator() { + return &thc_caching_host_allocator; +} diff --git a/aten/src/THC/THCCachingHostAllocator.h b/aten/src/THC/THCCachingHostAllocator.h new file mode 100644 index 0000000..adb86cb --- /dev/null +++ b/aten/src/THC/THCCachingHostAllocator.h @@ -0,0 +1,31 @@ +#ifndef THC_CACHING_HOST_ALLOCATOR_INC +#define THC_CACHING_HOST_ALLOCATOR_INC + +#include "THCGeneral.h" +#include "THCStream.h" + +// +// A caching allocator for CUDA host allocations (pinned memory). +// +// This provides a drop-in replacement for THCudaHostAllocator, which re-uses +// freed pinned (page-locked) memory allocations. This avoids device +// synchronizations due to cudaFreeHost calls. +// +// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be +// called anytime a pointer from this allocator is used in a cudaMemcpyAsync +// call between host and device. The THC library implements this for storages +// and tensors in THCTensor_(copyAsyncCPU) and THCTensor_(copyAsyncCuda). +// +// Note that this allocator does not split larger allocations into smaller +// blocks, unlike the caching device allocator. +// +THC_API THAllocator* getTHCCachingHostAllocator(void); + +// Records an event in the specified stream. The allocation 'ptr' will not be +// re-used until the event has occurred. 
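The call pattern the comment above prescribes, written out by hand with plain CUDA runtime calls rather than through the allocator, to show what recordEvent is protecting against: a pinned buffer handed to cudaMemcpyAsync must not be recycled until an event recorded after the copy has completed (error checking omitted for brevity):

```
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t bytes = 1 << 20;
  float* pinned = nullptr;
  float* device = nullptr;
  cudaStream_t stream;
  cudaEvent_t  done;

  cudaStreamCreate(&stream);
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaHostAlloc((void**)&pinned, bytes, cudaHostAllocDefault);
  cudaMalloc((void**)&device, bytes);

  cudaMemcpyAsync(device, pinned, bytes, cudaMemcpyHostToDevice, stream);
  cudaEventRecord(done, stream);     // what recordEvent does on your behalf

  // Only once the event has completed is the pinned buffer safe to recycle.
  cudaEventSynchronize(done);
  std::printf("copy finished; pinned buffer may be reused\n");

  cudaFree(device);
  cudaFreeHost(pinned);
  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  return 0;
}
```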
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream); + +// Releases cached pinned memory allocations via cudaHostFree +THC_API void THCCachingHostAllocator_emptyCache(void); + +#endif diff --git a/aten/src/THC/THCDeviceTensor-inl.cuh b/aten/src/THC/THCDeviceTensor-inl.cuh new file mode 100644 index 0000000..16e1f94 --- /dev/null +++ b/aten/src/THC/THCDeviceTensor-inl.cuh @@ -0,0 +1,416 @@ +#include + +namespace detail { + +template +__host__ __device__ void copy(T to[N], T from[N]) { + for (int i = 0; i < N; ++i) { + to[i] = from[i]; + } +} + +} // namespace detail + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor::THCDeviceTensor() + : data_(NULL) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = 0; + stride_[i] = (IndexT) 1; + } +} + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor:: +#ifdef _MSC_VER +THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim]) +#else +THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim]) +#endif + : data_(data) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int i = Dim - 2; i >= 0; --i) { + stride_[i] = stride_[i + 1] * sizes[i + 1]; + } +} + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor::THCDeviceTensor( +#ifdef _MSC_VER + DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim]) +#else + DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim]) +#endif + : data_(data) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + stride_[i] = strides[i]; + } +} + +template class PtrTraits> +template +__host__ __device__ bool +THCDeviceTensor::isSameSizeAndStride( + const THCDeviceTensor& rhs) const { + if (Dim != OtherDim) { + return false; + } + + for (int i = 0; i < Dim; ++i) { + if (size_[i] != rhs.size_[i]) { + return false; + } + + if (stride_[i] != rhs.stride_[i]) { + return false; + } + } + + return true; +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::cast() { + thc_static_assert(sizeof(U) == sizeof(T)); + + return THCDeviceTensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +template +__host__ __device__ const THCDeviceTensor +THCDeviceTensor::cast() const { + thc_static_assert(sizeof(U) == sizeof(T)); + + return THCDeviceTensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +__host__ __device__ ptrdiff_t +THCDeviceTensor::numElements() const { + ptrdiff_t size = getSize(0); + + for (int i = 1; i < Dim; ++i) { + size *= getSize(i); + } + + return size; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isContiguous() const { + return isContiguousRange(0, Dim); +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isConsistentlySized(int i) const { + if (i == 0 && getStride(i) > 0 && getSize(i) > 0) { + return true; + } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) && + ((getStride(i - 1) / getStride(i)) >= getSize(i))) { + return true; + } + + return false; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isConsistentlySized() const { + for (int i = 0; i < Dim; ++i) { + if (!isConsistentlySized(i)) { + return false; + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isContiguousRange( + int first, int last) const { + + int64_t prevSize = 
last < Dim ? getStride(last) * getSize(last) : 1; + + for (int i = last - 1; i >= first; --i) { + if (getSize(i) != (IndexT) 1) { + if (getStride(i) == prevSize) { + prevSize *= getSize(i); + } else { + return false; + } + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ THCDeviceTensor +THCDeviceTensor::transpose(int dim1, + int dim2) const { +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(dim1 >= 0 && dim1 < Dim); + assert(dim1 >= 0 && dim2 < Dim); +#else + // Host code + if (dim1 < 0 || dim1 >= Dim) { + THError("dim1 out of bounds"); + } + + if (dim2 < 0 || dim2 >= Dim) { + THError("dim2 out of bounds"); + } +#endif + + IndexT newSize[Dim]; + IndexT newStride[Dim]; + + for (int i = 0; i < Dim; ++i) { + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } + + IndexT tmp = newSize[dim1]; + newSize[dim1] = newSize[dim2]; + newSize[dim2] = tmp; + + tmp = newStride[dim1]; + newStride[dim1] = newStride[dim2]; + newStride[dim2] = tmp; + + return THCDeviceTensor(data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::upcastOuter() { + // Can only create tensors of greater dimension + thc_static_assert(NewDim > Dim); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int shift = NewDim - Dim; + + for (int i = 0; i < NewDim; ++i) { + if (i < shift) { + // These are the extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = size_[0] * stride_[0]; + } else { + // Shift the remaining dimensions + newSize[i] = size_[i - shift]; + newStride[i] = stride_[i - shift]; + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::upcastInner() { + // Can only create tensors of greater dimension + thc_static_assert(NewDim > Dim); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + for (int i = 0; i < NewDim; ++i) { + if (i < Dim) { + // Existing dimensions get copied over + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } else { + // Extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = (IndexT) 1; + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::downcastOuter() { + // Can only create tensors of lesser dimension + thc_static_assert(NewDim < Dim); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). 
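+  // For example (illustrative): collapsing a contiguous [2][3][4] tensor to
+  // NewDim = 2 yields a [6][4] view over the same storage. If dimension 0
+  // were padded (stride[0] > size[1] * stride[1]), the collapsed outer
+  // dimension would stride across the padding, so the check below rejects it.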
+ bool cont = isContiguousRange(0, Dim - NewDim); +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(cont); +#else + // Host code + if (!cont) { + THError("Can only downcast contiguous tensors"); + } +#endif + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int ignoredDims = Dim - NewDim; + IndexT collapsedSize = 1; + + for (int i = 0; i < Dim; ++i) { + if (i < ignoredDims) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == ignoredDims) { + // This is the first non-collapsed dimension + newSize[i - ignoredDims] = collapsedSize * getSize(i); + } else { + // Subsequent non-collapsed dimensions + newSize[i - ignoredDims] = getSize(i); + } + + newStride[i - ignoredDims] = getStride(i); + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::downcastInner() { + // Can only create tensors of lesser dimension + thc_static_assert(NewDim < Dim); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). + bool cont = isContiguousRange(NewDim, Dim); +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(cont); +#else + // Host code + if (!cont) { + THError("Can only downcast contiguous tensors"); + } +#endif + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + IndexT collapsedSize = 1; + + for (int i = Dim - 1; i >= 0; --i) { + if (i >= NewDim) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == NewDim - 1) { + // This is the first non-collapsed dimension + newSize[i] = collapsedSize * getSize(i); + newStride[i] = getStride(Dim - 1); + } else { + // Subsequent non-collapsed dimensions + newSize[i] = getSize(i); + newStride[i] = getStride(i); + } + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::view(DataPtrType at) { + thc_static_assert(SubDim >= 1 && SubDim < Dim); + + IndexT viewSizes[SubDim]; + IndexT viewStrides[SubDim]; + + for (int i = 0; i < SubDim; ++i) { + viewSizes[i] = size_[Dim - SubDim + i]; + viewStrides[i] = stride_[Dim - SubDim + i]; + } + + return THCDeviceTensor( + at, viewSizes, viewStrides); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::view() { + return view(data_); +} + +template class PtrTraits> +void +THCDeviceTensor::zero(cudaStream_t stream) { +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + assert(isContiguous()); +#else + if (!isContiguous()) { + THError("fillAsync only works on contiguous data"); + } +#endif + + cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream); +} diff --git a/aten/src/THC/THCDeviceTensor.cuh b/aten/src/THC/THCDeviceTensor.cuh new file mode 100644 index 0000000..2df26be --- /dev/null +++ b/aten/src/THC/THCDeviceTensor.cuh @@ -0,0 +1,513 @@ +#ifndef THC_DEVICE_TENSOR_INC +#define THC_DEVICE_TENSOR_INC + +#include +#include + +// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0. 
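+// How it works: only the <true> specialization below is defined, so
+// thc_static_assert(expr) constructs a harmless temporary when `expr` holds
+// and fails to compile (incomplete type) when it does not, e.g.
+//   thc_static_assert(sizeof(float) == 4);  // ok
+//   thc_static_assert(Dim > 0);             // compile error if Dim == 0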
+template +struct THCStaticAssert; + +template <> +struct THCStaticAssert { +}; + +#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>()) + +/// Our tensor type +template class PtrTraits> +class THCDeviceTensor; + +/// Type of a subspace of a tensor +namespace detail { +template class PtrTraits> +class THCDeviceSubTensor; +} + +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; + +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +/** + Templated multi-dimensional array that supports strided access of + elements. Main access is through `operator[]`; e.g., + `tensor[x][y][z]`. + +- `T` is the contained type (e.g., `float`) +- `Dim` is the tensor rank +- `IndexT` is the integer type used for size/stride arrays, and for +- all indexing math. Default is `int`, but for large tensors, `int64_t` +- can be used instead. +- `PtrTraits` are traits applied to our data pointer (T*). By default, +- this is just T*, but RestrictPtrTraits can be used to apply T* +- __restrict__ for alias-free analysis. +*/ +template class PtrTraits = DefaultPtrTraits> +class THCDeviceTensor { + public: + enum { NumDim = Dim }; + typedef T DataType; + typedef IndexT IndexType; + typedef typename PtrTraits::PtrType DataPtrType; + typedef THCDeviceTensor TensorType; + + /// Default constructor + __host__ __device__ THCDeviceTensor(); + + /// Constructor that calculates strides with no padding + __host__ __device__ THCDeviceTensor(DataPtrType data, +#ifdef _MSC_VER + const IndexT (&sizes)[Dim]); +#else + const IndexT sizes[Dim]); +#endif + + /// Constructor that takes arbitrary size/stride arrays + __host__ __device__ THCDeviceTensor(DataPtrType data, +#ifdef _MSC_VER + const IndexT (&sizes)[Dim], + const IndexT (&strides)[Dim]); +#else + const IndexT sizes[Dim], + const IndexT strides[Dim]); +#endif + + /// Returns true if the two tensors are of the same dimensionality, + /// size and stride. + template + __host__ __device__ bool + isSameSizeAndStride( + const THCDeviceTensor& rhs) const; + + /// Cast to a tensor of a different type of the same size and stride + template + __host__ __device__ THCDeviceTensor cast(); + + /// Const version of `cast` + template + __host__ __device__ + const THCDeviceTensor cast() const; + + /// Returns a raw pointer to the start of our data. + __host__ __device__ __forceinline__ DataPtrType data() { + return data_; + } + + /// Returns a raw pointer to the start of our data (const). + __host__ __device__ __forceinline__ + const DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + const typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Returns a read/write view of a portion of our tensor. + __host__ __device__ __forceinline__ + detail::THCDeviceSubTensor + operator[](IndexT); + + /// Returns a read/write view of a portion of our tensor (const). + __host__ __device__ __forceinline__ + const detail::THCDeviceSubTensor + operator[](IndexT) const; + + /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. + __host__ __device__ __forceinline__ int getSize(int i) const { + return size_[i]; + } + + /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. 
+ __host__ __device__ __forceinline__ int getStride(int i) const { + return stride_[i]; + } + + /// Returns the total number of elements contained within our data + /// (product of `getSize(i)`) + __host__ __device__ ptrdiff_t numElements() const; + + /// Returns the size array. + __host__ __device__ __forceinline__ const IndexT* sizes() const { + return size_; + } + + /// Returns the stride array. + __host__ __device__ __forceinline__ const IndexT* strides() const { + return stride_; + } + + /// Returns true if there is no padding within the tensor and no + /// re-ordering of the dimensions. + /// ~~~ + /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0 + /// ~~~ + __host__ __device__ bool isContiguous() const; + + /// Returns whether a given dimension has only increasing stride + /// from the previous dimension. A tensor that was permuted by + /// exchanging size and stride only will fail this check. + /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`. + __host__ __device__ bool isConsistentlySized(int i) const; + + // Returns whether at each dimension `stride <= size`. + // If this is not the case then iterating once over the size space will + // touch the same memory locations multiple times. + __host__ __device__ bool isConsistentlySized() const; + + /// Returns true if the given dimension range [first, last) has no padding. + __host__ __device__ bool isContiguousRange(int first, int last) const; + + /// Returns a tensor of the same dimension after transposing the two + /// dimensions given. Does not actually move elements; transposition + /// is made by permuting the size/stride arrays. + /// If the dimensions are not valid, asserts. + __host__ __device__ THCDeviceTensor + transpose(int dim1, int dim2) const; + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the leading dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` + template + __host__ __device__ THCDeviceTensor + upcastOuter(); + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the lowest/most varying dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]` + template + __host__ __device__ THCDeviceTensor + upcastInner(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + THCDeviceTensor downcastOuter(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + THCDeviceTensor downcastInner(); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting at `at`. + template + __host__ __device__ THCDeviceTensor + view(DataPtrType at); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting where our data begins + template + __host__ __device__ THCDeviceTensor + view(); + + /// Zeroes out the tensor asynchronously. Asserts if the contents + /// in question are not contiguous. 
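+  /// (Illustrative, with assumed names: given a tensor `t` and a stream `s`,
+  /// `t.zero(s)` issues a single cudaMemsetAsync over
+  /// numElements() * sizeof(T) bytes.)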
+ void zero(cudaStream_t stream = 0); + + private: + /// Raw pointer to where the tensor data begins + DataPtrType data_; + + /// Array of strides (in sizeof(T) terms) per each dimension + IndexT stride_[Dim]; + + /// Size per each dimension + IndexT size_[Dim]; +}; + +namespace detail { + +/// Specialization for a view of a single value (0-dimensional) +template class PtrTraits> +class THCDeviceSubTensor { + public: + __host__ __device__ THCDeviceSubTensor + operator=(typename TensorType::DataType val) { + *data_ = val; + return *this; + } + + // operator T& + __host__ __device__ operator typename TensorType::DataType&() { + return *data_; + } + + // const operator T& returning const T& + __host__ __device__ operator const typename TensorType::DataType&() const { + return *data_; + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ __forceinline__ + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ __forceinline__ typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ __forceinline__ T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + private: + /// One dimension greater can create us + friend class THCDeviceSubTensor; + + /// Our parent tensor can create us + friend class THCDeviceTensor; + + __host__ __device__ __forceinline__ THCDeviceSubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// Where our value is located + typename TensorType::DataPtrType const data_; +}; + +/// A `SubDim`-rank slice of a parent THCDeviceTensor +template class PtrTraits> +class THCDeviceSubTensor { + public: + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor). + __host__ __device__ __forceinline__ + THCDeviceSubTensor + operator[](typename TensorType::IndexType index) { + return THCDeviceSubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor) (const). 
+ __host__ __device__ __forceinline__ + const THCDeviceSubTensor + operator[](typename TensorType::IndexType index) const { + return THCDeviceSubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ __forceinline__ + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ __forceinline__ typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ __forceinline__ T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + /// Returns a tensor that is a view of the SubDim-dimensional slice + /// of this tensor, starting where our data begins + THCDeviceTensor view() { + return tensor_.template view(data_); + } + + private: + /// One dimension greater can create us + friend class THCDeviceSubTensor; + + /// Our parent tensor can create us + friend class + THCDeviceTensor; + + __host__ __device__ __forceinline__ THCDeviceSubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// The start of our sub-region + typename TensorType::DataPtrType const data_; +}; + +} // namespace detail + +template class PtrTraits> +__host__ __device__ __forceinline__ +detail::THCDeviceSubTensor, + Dim - 1, PtrTraits> +THCDeviceTensor::operator[](IndexT index) { + return detail::THCDeviceSubTensor( + detail::THCDeviceSubTensor( + *this, data_)[index]); +} + +template class PtrTraits> +__host__ __device__ __forceinline__ +const detail::THCDeviceSubTensor, + Dim - 1, PtrTraits> +THCDeviceTensor::operator[](IndexT index) const { + return detail::THCDeviceSubTensor( + detail::THCDeviceSubTensor( + const_cast(*this), data_)[index]); +} + +#include "THCDeviceTensor-inl.cuh" + +#endif // THC_DEVICE_TENSOR_INC diff --git a/aten/src/THC/THCDeviceTensorUtils-inl.cuh b/aten/src/THC/THCDeviceTensorUtils-inl.cuh new file mode 100644 index 0000000..469dd5f --- /dev/null +++ b/aten/src/THC/THCDeviceTensorUtils-inl.cuh @@ -0,0 +1,118 @@ +namespace detail { + +// Add a layer of SFINAE to support static_assert +template class PtrTraits, + int NewDim, bool B> +struct UpcastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t); +}; + 
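+// The boolean parameter B is computed as (NewDim > Dim) at the call site
+// (see SWITCH_UNROLL_CUDA_CAST_FACTORY below): only the B == true
+// specialization defines make(), while the B == false case inherits the
+// undefined declaration above and is never instantiated.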
+template class PtrTraits, + int NewDim, bool B> +struct UpcastTHC : + UpcastTHCRoot { +}; + +// Never instantiated SFINAE purposes only +template class PtrTraits, + int NewDim> +struct UpcastTHC : + UpcastTHCRoot { +}; + +template class PtrTraits, + int NewDim> +struct UpcastTHC : + UpcastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t) { + thc_static_assert(NewDim > Dim); + return toDeviceTensor(state, t). + template upcastOuter(); + } +}; + +// Add a layer of SFINAE to support static_assert +template class PtrTraits, + int NewDim, bool B> +struct DowncastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t); +}; + +template class PtrTraits, + int NewDim, bool B> +struct DowncastTHC : + DowncastTHCRoot { +}; + +// Never instantiated SFINAE purposes only +template class PtrTraits, + int NewDim> +struct DowncastTHC : + DowncastTHCRoot { +}; + +template class PtrTraits, + int NewDim> +struct DowncastTHC : + DowncastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t) { + thc_static_assert(NewDim < Dim); + return toDeviceTensor(state, t). + template downcastOuter(); + } +}; + +} // namespace detail + +#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i) \ + case i: \ + if (NewDim > i) { \ + return detail::UpcastTHC i)>:: \ + make(state, t); \ + } else if (NewDim == i) { \ + return toDeviceTensor(state, t); \ + } else { \ + return detail::DowncastTHC:: \ + make(state, t); \ + } \ + /* break; */ + +template class PtrTraits> +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + switch (THCudaTensor__nDimension(state, t)) { + SWITCH_UNROLL_CUDA_CAST_FACTORY(1); + SWITCH_UNROLL_CUDA_CAST_FACTORY(2); + SWITCH_UNROLL_CUDA_CAST_FACTORY(3); + SWITCH_UNROLL_CUDA_CAST_FACTORY(4); + SWITCH_UNROLL_CUDA_CAST_FACTORY(5); + SWITCH_UNROLL_CUDA_CAST_FACTORY(6); + SWITCH_UNROLL_CUDA_CAST_FACTORY(7); + SWITCH_UNROLL_CUDA_CAST_FACTORY(8); + SWITCH_UNROLL_CUDA_CAST_FACTORY(9); + SWITCH_UNROLL_CUDA_CAST_FACTORY(10); + default: + ; + } + + // Not implemented + THError("THCDeviceTensor dimension size not supported"); + return NULL; /* never enters this piece, appeasing compiler warnings */ +} + +#undef SWITCH_UNROLL_CUDA_CAST_FACTORY diff --git a/aten/src/THC/THCDeviceTensorUtils.cuh b/aten/src/THC/THCDeviceTensorUtils.cuh new file mode 100644 index 0000000..2ab9d4e --- /dev/null +++ b/aten/src/THC/THCDeviceTensorUtils.cuh @@ -0,0 +1,80 @@ +#ifndef THC_DEVICE_TENSOR_UTILS_INC +#define THC_DEVICE_TENSOR_UTILS_INC + +#include "THCDeviceTensor.cuh" +#include "THCTensor.hpp" +#include + +/// Constructs a DeviceTensor initialized from a THCudaTensor by +/// upcasting or downcasting the tensor to that of a different +/// dimension. +template class PtrTraits> +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t); + +template +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + return toDeviceTensorCast(state, t); +} + +template +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + return toDeviceTensorCast(state, t); +} + +/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will +/// error if the dimensionality does not match exactly. 
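+/// Illustrative usage sketch (names assumed):
+///   THCDeviceTensor<float, 2> d = toDeviceTensor<float, 2>(state, t);
+///   // in device code: d[i][j] = 0.0f;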
+template class PtrTraits> +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t); + +template +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + return toDeviceTensor(state, t); +} + +template +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + return toDeviceTensor(state, t); +} + +template class PtrTraits> +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + if (Dim != THCTensor__nDimension(state, t)) { + THError("THCudaTensor dimension mismatch"); + } + // Determine the maximum offset into the tensor achievable; `IndexT` + // must be smaller than this type in order to use it. + ptrdiff_t maxOffset = 0; + IndexT sizes[Dim]; + IndexT strides[Dim]; + + for (int i = 0; i < Dim; ++i) { + int64_t size = THCTensor_size(state, t, i); + int64_t stride = THCTensor_stride(state, t, i); + + maxOffset += (size - 1) * stride; + + sizes[i] = (IndexT) size; + strides[i] = (IndexT) stride; + } + + if (maxOffset > std::numeric_limits::max()) { + THError("THCudaTensor sizes too large for THCDeviceTensor conversion"); + } + + return THCDeviceTensor( + t->data(), sizes, strides); +} + +#include "THCDeviceTensorUtils-inl.cuh" + +#endif // THC_DEVICE_TENSOR_UTILS_INC diff --git a/aten/src/THC/THCDeviceUtils.cuh b/aten/src/THC/THCDeviceUtils.cuh new file mode 100644 index 0000000..7f16455 --- /dev/null +++ b/aten/src/THC/THCDeviceUtils.cuh @@ -0,0 +1,112 @@ +#ifndef THC_DEVICE_UTILS_INC +#define THC_DEVICE_UTILS_INC + +#include +/* The largest consecutive integer representable in float32 (2^24) */ +#define FLOAT32_MAX_CONSECUTIVE_INT 16777216.0f + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T THCCeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +/** + Computes ceil(a / b) * b; i.e., rounds up `a` to the next highest + multiple of b +*/ +template +__host__ __device__ __forceinline__ T THCRoundUp(T a, T b) { + return THCCeilDiv(a, b) * b; +} + +/** + * For CC 3.5+, perform a load using __ldg + */ +template +__device__ __forceinline__ T doLdg(const T* p) { +#if __CUDA_ARCH__ >= 350 + return __ldg(p); +#else + return *p; +#endif +} + +__device__ __forceinline__ unsigned int ACTIVE_MASK() +{ +#if CUDA_VERSION >= 9000 + return __activemask(); +#else +// will be ignored anyway + return 0xffffffff; +#endif +} + +__device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __ballot_sync(mask, predicate); +#else + return __ballot(predicate); +#endif +} + +#ifdef __HIP_PLATFORM_HCC__ +//To handle ambiguity, add a type double version. 
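+// (The generic template below would forward a double argument to the shuffle
+// intrinsic, which HIP does not support for double, so this overload
+// round-trips through float at reduced precision.)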
+__device__ __forceinline__ double WARP_SHFL_XOR(double value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) { + //(HIP doesn't support double) + return (double) __shfl_xor((float) value, laneMask, width); +} +#endif +template +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_sync(mask, value, srcLane, width); +#else + return __shfl(value, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_up_sync(mask, value, delta, width); +#else + return __shfl_up(value, delta, width); +#endif +} + +#ifdef __HIP_PLATFORM_HCC__ +//To handle ambiguity, add a type double version. +__device__ __forceinline__ double WARP_SHFL_DOWN(double value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ + //(HIP doesn't support double) + return (double) __shfl_down((float) value, delta, width); +} +#endif +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + + +#endif // THC_DEVICE_UTILS_INC diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp new file mode 100644 index 0000000..bde2c39 --- /dev/null +++ b/aten/src/THC/THCGeneral.cpp @@ -0,0 +1,761 @@ +#include "THCGeneral.h" +#include "TH.h" +#include "THCAllocator.h" +#include "THCCachingHostAllocator.h" +#include "THCThreadLocal.h" +#include "THCTensorRandom.h" +#include "THCGeneral.hpp" + +#include "ATen/CUDAStream.h" + +#include "THCCachingAllocator.h" +#include +#include + +/* Size of scratch space available in global memory per each SM + stream */ +#define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float) + +/* Minimum amount of scratch space per device. Total scratch memory per + * device is either this amount, or the # of SMs * the space per SM defined + * above, whichever is greater.*/ +#define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE 32768 * sizeof(float) + +/* Maximum number of P2P connections (if there are more than 9 then P2P is + * enabled in groups of 8). 
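+ * For example, with 12 visible devices, devices 0-7 and 8-11 form separate
+ * peer groups: p2pAccessEnabled[0][9] is initialized to 0 (never allowed),
+ * while p2pAccessEnabled[0][3] starts at -1 (unknown until queried).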
*/ +#define THC_CUDA_MAX_PEER_SIZE 8 + +void THCState_free(THCState* state) +{ + free(state); +} + +THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( + THCState *state, int device); + +THCState* THCState_alloc(void) +{ + THCState* state = (THCState*) malloc(sizeof(THCState)); + memset(state, 0, sizeof(THCState)); + return state; +} + +static void THDefaultDeviceDeleter(void* ptr) { + THCudaCheck(cudaFree(ptr)); +} + +struct THDefaultDeviceAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* p = nullptr; + if (size != 0) THCudaCheck(cudaMalloc(&p, size)); + int device; + THCudaCheck(cudaGetDevice(&device)); + return {p, p, &THDefaultDeviceDeleter, at::Device(at::kCUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THDefaultDeviceDeleter; + } +}; + +static THDefaultDeviceAllocator defaultDeviceAllocator; + +void THCudaInit(THCState* state) +{ + if (!state->cudaDeviceAllocator) { + state->cudaDeviceAllocator = &defaultDeviceAllocator; + } + if (!state->cudaHostAllocator) { + state->cudaHostAllocator = getTHCudaHostAllocator(); + } + if (!state->cudaUVAAllocator) { + state->cudaUVAAllocator = getTHCUVAAllocator(); + } + + int numDevices = 0; + THCudaCheck(cudaGetDeviceCount(&numDevices)); + state->numDevices = numDevices; + + int device = 0; + THCudaCheck(cudaGetDevice(&device)); + + state->currentPerDeviceBlasHandle = THCThreadLocal_alloc(); + state->currentPerDeviceSparseHandle = THCThreadLocal_alloc(); + + state->resourcesPerDevice = (THCCudaResourcesPerDevice*) + malloc(numDevices * sizeof(THCCudaResourcesPerDevice)); + memset(state->resourcesPerDevice, 0, numDevices * sizeof(THCCudaResourcesPerDevice)); + + state->deviceProperties = + (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp)); + + state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState)); + THCRandom_init(state, numDevices, device); + + // By default, all direct p2p kernel access (besides copy) is disallowed, + // since direct access without knowing whether or not a certain operation + // should be cross-GPU leads to synchronization errors. The user can choose + // to disable this functionality, however. + state->p2pKernelAccessEnabled = 0; + + // p2pAccessEnabled records if p2p copies are allowed between pairs of + // devices. Values include "1" (copy allowed), "0" (copy not allowed), and + // "-1" (unknown). + // Currently the max number of gpus in P2P group is 8, so if there are more + // we enable P2P in groups of 8 + state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices); + for (int i = 0; i < numDevices; ++i) { + state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices); + for (int j = 0; j < numDevices; ++j) + if (i == j) + state->p2pAccessEnabled[i][j] = 1; + else if (j / THC_CUDA_MAX_PEER_SIZE != i / THC_CUDA_MAX_PEER_SIZE) + state->p2pAccessEnabled[i][j] = 0; + else + state->p2pAccessEnabled[i][j] = -1; + } + + for (int i = 0; i < numDevices; ++i) { + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i); + THCudaCheck(cudaSetDevice(i)); + THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i)); + + /* The scratch space that we want to have available per each device is + based on the number of SMs available per device. We guarantee a + minimum of 128kb of space per device, but to future-proof against + future architectures that may have huge #s of SMs, we guarantee that + we have at least 16 bytes for each SM. 
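+       For example (illustrative): an 80-SM device needs 80 * 16 = 1280 bytes,
+       well under the 128kb floor, so it receives the floor; the per-SM term
+       only wins on a device with more than 8192 SMs.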
*/ + int numSM = state->deviceProperties[i].multiProcessorCount; + size_t sizePerStream = + MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ? + MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE : + numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM; + res->scratchSpacePerStream = sizePerStream; + } + + /* Restore to previous device */ + THCudaCheck(cudaSetDevice(device)); + + // Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC + // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS + // handles are created lazily. + state->numUserBlasHandles = 1; + state->numUserSparseHandles = 1; + + state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically + state->heapDelta = 0; +} + +void THCudaShutdown(THCState* state) +{ + THCRandom_shutdown(state); + + free(state->rngState); + free(state->deviceProperties); + + int deviceCount = 0; + int prevDev = -1; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaGetDeviceCount(&deviceCount)); + + /* cleanup p2p access state */ + for (int dev = 0; dev < deviceCount; ++dev) { + free(state->p2pAccessEnabled[dev]); + } + free(state->p2pAccessEnabled); + + /* cleanup per-device state */ + for (int dev = 0; dev < deviceCount; ++dev) { + THCudaCheck(cudaSetDevice(dev)); + THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); + /* Free user defined BLAS handles */ + for (int i = 0; i < res->numBlasHandles; ++i) { + THCublasCheck(cublasDestroy(res->blasHandles[i])); + } + /* Free user defined sparse handles */ + for (int i = 0; i < res->numSparseHandles; ++i) { + THCusparseCheck(cusparseDestroy(res->sparseHandles[i])); + } + + free(res->blasHandles); + free(res->sparseHandles); + } + free(state->resourcesPerDevice); + if (state->cudaDeviceAllocator == THCCachingAllocator_get()) { + THCCachingAllocator_emptyCache(); + } + if (state->cudaHostAllocator == getTHCCachingHostAllocator()) { + THCCachingHostAllocator_emptyCache(); + } + THCThreadLocal_free(state->currentPerDeviceBlasHandle); + + THCudaCheck(cudaSetDevice(prevDev)); +} + +int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess) +{ + if (dev < 0 || dev >= state->numDevices) { + THError("%d is not a device", dev); + } + if (devToAccess < 0 || devToAccess >= state->numDevices) { + THError("%d is not a device", devToAccess); + } + if (state->p2pAccessEnabled[dev][devToAccess] == -1) { + int prevDev = 0; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(dev)); + + int access = 0; + THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess)); + if (access) { + cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + // ignore and clear the error if access was already enabled + cudaGetLastError(); + } else { + THCudaCheck(err); + } + state->p2pAccessEnabled[dev][devToAccess] = 1; + } else { + state->p2pAccessEnabled[dev][devToAccess] = 0; + } + + THCudaCheck(cudaSetDevice(prevDev)); + } + return state->p2pAccessEnabled[dev][devToAccess]; +} + +void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess, + int enable) +{ + /* This will perform device bounds checking for us */ + int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess); + + if (enable != prevEnabled) { + /* If we're attempting to enable p2p access but p2p access isn't */ + /* supported, throw an error */ + if (enable) { + int access = 0; + THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess)); + + if (!access) { + THError("p2p 
access not supported for %d accessing %d", + dev, devToAccess); + } + } + + state->p2pAccessEnabled[dev][devToAccess] = enable; + + int prevDev = 0; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(dev)); + + /* This should be in sync with the current access state */ + if (enable) { + THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0)); + } else { + THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess)); + } + + THCudaCheck(cudaSetDevice(prevDev)); + } +} + +int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) { + return state->p2pKernelAccessEnabled; +} + +void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) { + state->p2pKernelAccessEnabled = val; +} + +struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state) +{ + int curDev = -1; + THCudaCheck(cudaGetDevice(&curDev)); + + return &(state->deviceProperties[curDev]); +} + +struct cudaDeviceProp* THCState_getDeviceProperties(THCState* state, int device) +{ + THAssert(device >= 0 && device < state->numDevices); + return &(state->deviceProperties[device]); +} + +struct THCRNGState* THCState_getRngState(THCState *state) +{ + return state->rngState; +} + +THAllocator* THCState_getCudaHostAllocator(THCState* state) +{ + return state->cudaHostAllocator; +} + +THAllocator* THCState_getCudaUVAAllocator(THCState* state) +{ + return state->cudaUVAAllocator; +} + +THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state) +{ + return state->cudaDeviceAllocator; +} + +void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator) +{ + state->cudaDeviceAllocator = allocator; +} + +int THCState_isCachingAllocatorEnabled(THCState* state) { + return state->cudaHostAllocator == getTHCCachingHostAllocator(); +} + +int THCState_getNumDevices(THCState *state) +{ + return state->numDevices; +} + +void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles) +{ + int prevDev = -1; + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + if (numBlasHandles <= res->numBlasHandles) { + return; + } + + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(device)); + + size_t size = numBlasHandles * sizeof(cublasHandle_t); + cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size); + for (int i = res->numBlasHandles; i < numBlasHandles; ++i) { + handles[i] = NULL; + THCublasCheck(cublasCreate(&handles[i])); + } + res->blasHandles = handles; + res->numBlasHandles = numBlasHandles; + + THCudaCheck(cudaSetDevice(prevDev)); +} + +void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles) +{ + int prevDev = -1; + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + if (numSparseHandles <= res->numSparseHandles) { + return; + } + + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(device)); + + size_t size = numSparseHandles * sizeof(cusparseHandle_t); + cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size); + for (int i = res->numSparseHandles; i < numSparseHandles; ++i) { + handles[i] = NULL; + THCusparseCheck(cusparseCreate(&handles[i])); + } + res->sparseHandles = handles; + res->numSparseHandles = numSparseHandles; + + THCudaCheck(cudaSetDevice(prevDev)); +} + +void THCState_reserveBlasHandles(THCState* state, int numBlasHandles) +{ + // cuBLAS handles are created lazily from THCState_getDeviceBlasHandle + // to avoid initializing unused devices + if (numBlasHandles > 
state->numUserBlasHandles) + { + state->numUserBlasHandles = numBlasHandles; + } +} + +void THCState_reserveSparseHandles(THCState* state, int numSparseHandles) +{ + // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle + // to avoid initializing unused devices + if (numSparseHandles > state->numUserSparseHandles) + { + state->numUserSparseHandles = numSparseHandles; + } +} + +int THCState_getNumBlasHandles(THCState* state) +{ + return state->numUserBlasHandles; +} + +int THCState_getNumSparseHandles(THCState* state) +{ + return state->numUserSparseHandles; +} + +THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( + THCState *state, int device) +{ + /* `device` is a CUDA index */ + if (device >= state->numDevices || device < 0) + { + THError("%d is not a device", device + 1 /* back to Torch index */); + } + + return &(state->resourcesPerDevice[device]); +} + +cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle) +{ + if (handle <= 0 || handle > state->numUserBlasHandles) { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserBlasHandles); + } + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + THCState_reserveDeviceBlasHandles(state, device, handle); + return res->blasHandles[handle - 1]; +} + +cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle) +{ + if (handle <= 0 || handle > state->numUserSparseHandles) { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + THCState_reserveDeviceSparseHandles(state, device, handle); + return res->sparseHandles[handle - 1]; +} + +THCStream* THCState_getStreamOnDevice(THCState* state, int device) { + return at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); +} + +void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream) { + at::detail::CUDAStream_setStreamOnDevice(device, stream); +} + +cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device) { + return at::detail::CUDAStream_stream( + at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); +} + +cudaStream_t THCState_getCurrentStream(THCState *state) { + return at::detail::CUDAStream_stream( + at::detail::CUDAStream_getCurrentStreamUnsafe()); +} + +THCStream* THCState_getStream(THCState *state) { + return at::detail::CUDAStream_getCurrentStreamUnsafe(); +} + +void THCState_setStream(THCState *state, THCStream *stream) { + at::detail::CUDAStream_setStream(stream); +} + +cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) +{ + /* This is called at the point of kernel execution. + For some debugging code or improperly instrumented kernels, + `state` is null */ + if (state) { + int device; + THCudaCheck(cudaGetDevice(&device)); + + int handle = THCState_getCurrentBlasHandleIndex(state); + return THCState_getDeviceBlasHandle(state, device, handle); + } + THError("THCState and blasHandles must be set as there is no default blasHandle"); + return NULL; +} + +cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) +{ + /* This is called at the point of kernel execution. 
+ For some debugging code or improperly instrumented kernels, + `state` is null */ + if (state) { + int device; + THCudaCheck(cudaGetDevice(&device)); + + int handle = THCState_getCurrentSparseHandleIndex(state); + return THCState_getDeviceSparseHandle(state, device, handle); + } + THError("THCState and sparseHandles must be set as there is no default sparseHandle"); + return NULL; +} + +int THCState_getCurrentBlasHandleIndex(THCState *state) +{ + void* value = THCThreadLocal_get(state->currentPerDeviceBlasHandle); + if (value == NULL) { + return 1; + } + return (int) (intptr_t) value; +} + +int THCState_getCurrentSparseHandleIndex(THCState *state) +{ + void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle); + if (value == NULL) { + return 1; + } + return (int) (intptr_t) value; +} + +void THCState_setCurrentBlasHandleIndex(THCState *state, int handle) +{ + if (handle > state->numUserBlasHandles || handle <= 0) + { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserBlasHandles); + } + THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle); +} + +void THCState_setCurrentSparseHandleIndex(THCState *state, int handle) +{ + if (handle > state->numUserSparseHandles || handle <= 0) + { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle); +} + +size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state) +{ + int device = -1; + THCudaCheck(cudaGetDevice(&device)); + return THCState_getDeviceScratchSpaceSize(state, device); +} + +size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device) +{ + THCCudaResourcesPerDevice* res = + THCState_getDeviceResourcePtr(state, device); + + return res->scratchSpacePerStream; +} + +void __THCudaCheck(cudaError_t err, const char *file, const int line) +{ + if(err != cudaSuccess) + { + static int alreadyFailed = 0; + if(!alreadyFailed) { + fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err)); + alreadyFailed = 1; + } + _THError(file, line, "cuda runtime error (%d) : %s", err, + cudaGetErrorString(err)); + } +} + +void __THCudaCheckWarn(cudaError_t err, const char *file, const int line) +{ + if(err != cudaSuccess) + { + fprintf(stderr, "THCudaCheckWarn FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err)); + } +} + +void __THCublasCheck(cublasStatus_t status, const char *file, const int line) +{ + if(status != CUBLAS_STATUS_SUCCESS) + { + const char* errmsg = NULL; + + switch(status) + { + case CUBLAS_STATUS_NOT_INITIALIZED: + errmsg = "library not initialized"; + break; + + case CUBLAS_STATUS_ALLOC_FAILED: + errmsg = "resource allocation failed"; + break; + + case CUBLAS_STATUS_INVALID_VALUE: + errmsg = "an invalid numeric value was used as an argument"; + break; + + case CUBLAS_STATUS_ARCH_MISMATCH: + errmsg = "an absent device architectural feature is required"; + break; + + case CUBLAS_STATUS_MAPPING_ERROR: + errmsg = "an access to GPU memory space failed"; + break; + + case CUBLAS_STATUS_EXECUTION_FAILED: + errmsg = "the GPU program failed to execute"; + break; + + case CUBLAS_STATUS_INTERNAL_ERROR: + errmsg = "an internal operation failed"; + break; + + default: + errmsg = "unknown error"; + break; + } + + _THError(file, line, "cublas runtime error : %s", errmsg); + } +} + +void __THCusparseCheck(cusparseStatus_t status, const char *file, const int 
line) +{ + if(status != CUSPARSE_STATUS_SUCCESS) + { + const char* errmsg = NULL; + + switch(status) + { + case CUSPARSE_STATUS_NOT_INITIALIZED: + errmsg = "library not initialized"; + break; + + case CUSPARSE_STATUS_ALLOC_FAILED: + errmsg = "resource allocation failed"; + break; + + case CUSPARSE_STATUS_INVALID_VALUE: + errmsg = "an invalid numeric value was used as an argument"; + break; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + errmsg = "an absent device architectural feature is required"; + break; + + case CUSPARSE_STATUS_MAPPING_ERROR: + errmsg = "an access to GPU memory space failed"; + break; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + errmsg = "the GPU program failed to execute"; + break; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + errmsg = "an internal operation failed"; + break; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + errmsg = "the matrix type is not supported by this function"; + break; + + default: + errmsg = "unknown error"; + break; + } + + _THError(file, line, "cusparse runtime error : %s", errmsg); + } +} + +void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data ) +{ + state->cutorchGCFunction = cutorchGCFunction_; + state->cutorchGCData = data; +} + +void* THCudaMalloc(THCState *state, size_t size) +{ + THCudaCheck(cudaGetLastError()); + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; + if (state->cutorchGCFunction != nullptr) { + try { + return allocator->raw_allocate(size); + } catch (...) { + cudaGetLastError(); // reset OOM error + (state->cutorchGCFunction)(state->cutorchGCData); + return allocator->raw_allocate(size); + } + } else { + return allocator->raw_allocate(size); + } +} + +void THCudaFree(THCState *state, void* ptr) { + state->cudaDeviceAllocator->raw_deallocate(ptr); +} + +at::DataPtr THCudaHostAlloc(THCState *state, size_t size) +{ + THCudaCheck(cudaGetLastError()); + THAllocator* allocator = state->cudaHostAllocator; + return allocator->allocate(size); +} + +void THCudaHostRecord(THCState *state, void *ptr) { + if (state->cudaHostAllocator == getTHCCachingHostAllocator()) { + THCStream* stream = THCState_getStream(state); + THCCachingHostAllocator_recordEvent(ptr, stream); + } +} + +cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes) +{ + size_t largestBlock = 0; + return THCudaMemGetInfoCached(state, freeBytes, totalBytes, &largestBlock); +} + +cudaError_t THCudaMemGetInfoCached(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock) +{ + size_t cachedBytes = 0; + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; + + *largestBlock = 0; + /* get info from CUDA first */ + cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes); + if (ret!= cudaSuccess) + return ret; + + int device; + ret = cudaGetDevice(&device); + if (ret!= cudaSuccess) + return ret; + + /* not always true - our optimistic guess here */ + *largestBlock = *freeBytes; + + if (allocator == THCCachingAllocator_get()) { + THCCachingAllocator_cacheInfo(device, &cachedBytes, largestBlock); + } + + /* Adjust resulting free bytes number. 
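+     (Blocks held by the caching allocator are reusable without a new
+     cudaMalloc, so they are counted as free here.)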
largesBlock unused for now */ + *freeBytes += cachedBytes; + return cudaSuccess; +} + +#undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM +#undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE + +#include "THCStorage.cpp" +#include "THCAllocator.cpp" + +/* from THCHalf.h */ + +half THC_float2half(float f) +{ +#if CUDA_VERSION < 9000 + half h; + TH_float2halfbits(&f, &h.x); + return h; +#else + __half_raw h_raw; + TH_float2halfbits(&f, &h_raw.x); + return half(h_raw); +#endif +} + +float THC_half2float(half h) +{ + float f; +#if CUDA_VERSION < 9000 + TH_halfbits2float(&h.x, &f); +#else + __half_raw h_raw(h); + TH_halfbits2float(&h_raw.x, &f); +#endif + return f; +} diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in new file mode 100644 index 0000000..4275916 --- /dev/null +++ b/aten/src/THC/THCGeneral.h.in @@ -0,0 +1,158 @@ +#ifndef THC_GENERAL_INC +#define THC_GENERAL_INC + +#include "THGeneral.h" +#include "THAllocator.h" +#include "THCThreadLocal.h" +#undef log10 +#undef log1p +#undef log2 +#undef expm1 + +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cusparse.h" + +#cmakedefine USE_MAGMA + +#ifdef __cplusplus +# define THC_EXTERNC extern "C" +#else +# define THC_EXTERNC extern +#endif + +#ifdef _WIN32 +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# define THC_API THC_EXTERNC __declspec(dllexport) +# define THC_CLASS __declspec(dllexport) +# else +# define THC_API THC_EXTERNC __declspec(dllimport) +# define THC_CLASS __declspec(dllimport) +# endif +#else +# define THC_API THC_EXTERNC +# define THC_CLASS +#endif + +#ifndef THAssert +#define THAssert(exp) \ + do { \ + if (!(exp)) { \ + _THError(__FILE__, __LINE__, "assert(%s) failed", #exp); \ + } \ + } while(0) +#endif + +struct THCRNGState; /* Random number generator state. */ +typedef struct CUDAStreamInternals THCStream; +typedef struct THCState THCState; +struct THCState; + +typedef THAllocator THCDeviceAllocator; + +typedef struct _THCCudaResourcesPerDevice { + /* Number of materialized cuBLAS handles */ + int numBlasHandles; + /* Number of materialized cuSparse handles */ + int numSparseHandles; + /* cuBLAS handes are lazily initialized */ + cublasHandle_t* blasHandles; + /* cuSparse handes are lazily initialized */ + cusparseHandle_t* sparseHandles; + /* Size of scratch space per each stream on this device available */ + size_t scratchSpacePerStream; +} THCCudaResourcesPerDevice; + +THC_API THCState* THCState_alloc(void); +THC_API void THCState_free(THCState* state); + +THC_API void THCudaInit(THCState* state); +THC_API void THCudaShutdown(THCState* state); + +/* If device `dev` can access allocations on device `devToAccess`, this will return */ +/* 1; otherwise, 0. */ +THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess); +/* Enables or disables allowed p2p access using cutorch copy. If we are */ +/* attempting to enable access, throws an error if CUDA cannot enable p2p */ +/* access. */ +THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess, + int enable); + +/* By default, direct in-kernel access to memory on remote GPUs is + disabled. When set, this allows direct in-kernel access to remote + GPUs where GPU/GPU p2p access is enabled and allowed. 
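+   Query and toggle this flag with the two accessors declared below.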
*/ +THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state); +THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val); + +THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state); +THC_API struct cudaDeviceProp* THCState_getDeviceProperties(THCState* state, int device); + +THC_API struct THCRNGState* THCState_getRngState(THCState* state); +THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state); +THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state); +THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state); +THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator); +THC_API int THCState_isCachingAllocatorEnabled(THCState* state); + +THC_API void THCMagma_init(THCState *state); + +/* State manipulators and accessors */ +THC_API int THCState_getNumDevices(THCState* state); + +/* Stream API */ +THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device); +THC_API cudaStream_t THCState_getCurrentStream(THCState *state); + +THC_API THCStream* THCState_getStream(THCState *state); +THC_API void THCState_setStream(THCState *state, THCStream* stream); +THC_API THCStream* THCState_getStreamOnDevice(THCState* state, int device); +THC_API void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream); + +THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles); +THC_API int THCState_getNumBlasHandles(THCState* state); + +THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles); +THC_API int THCState_getNumSparseHandles(THCState* state); + +THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle); +THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state); +THC_API int THCState_getCurrentBlasHandleIndex(THCState *state); +THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle); + +THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle); +THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state); +THC_API int THCState_getCurrentSparseHandleIndex(THCState *state); +THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle); + +/* For the current device and stream, returns the allocated scratch space */ +THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state); +THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device); + +#define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs") +#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__) +#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__) +#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__) +#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__) + +THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line); +THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line); +THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line); +THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line); + +THC_API void* THCudaMalloc(THCState *state, size_t size); +THC_API void THCudaFree(THCState *state, void* ptr); + +#ifdef __cplusplus +at::DataPtr THCudaHostAlloc(THCState *state, size_t size); +#endif + +THC_API void THCudaHostRecord(THCState *state, void *ptr); + +THC_API 
cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes); +THC_API cudaError_t THCudaMemGetInfoCached(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock); +THC_API void THCSetGCHandler(THCState *state, + void (*torchGCHandlerFunction)(void *data), + void *data ); + +#endif diff --git a/aten/src/THC/THCGeneral.hpp b/aten/src/THC/THCGeneral.hpp new file mode 100644 index 0000000..89436f7 --- /dev/null +++ b/aten/src/THC/THCGeneral.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include "THCGeneral.h" + +/* Global state of THC. */ +struct THCState { + struct THCRNGState* rngState; + struct cudaDeviceProp* deviceProperties; + /* Set of all allocated resources. blasHandles and sparseHandles do not have + a default and must be explicitly initialized. We always initialize 1 + blasHandle and 1 sparseHandle but we can use more. + */ + THCCudaResourcesPerDevice* resourcesPerDevice; + /* Captured number of devices upon startup; convenience for bounds checking */ + int numDevices; + int numUserBlasHandles; + int numUserSparseHandles; + + /* Allocator using cudaMallocHost. */ + // NB: These allocators (specifically, cudaHostAllocator) MUST implement + // maybeGlobalBoundDeleter, because we have a few use-cases where we need to + // do raw allocations with them (for Thrust). + // TODO: Make this statically obvious + at::Allocator* cudaHostAllocator; + at::Allocator* cudaUVAAllocator; + at::Allocator* cudaDeviceAllocator; + + /* Index of the current selected BLAS handle. The actual BLAS handle used + depends on the current device. */ + THCThreadLocal/**/ currentPerDeviceBlasHandle; + /* Index of the current selected sparse handle. The actual sparse handle used + depends on the current device. */ + THCThreadLocal/**/ currentPerDeviceSparseHandle; + + /* Table of enabled peer-to-peer access between directed pairs of GPUs. + If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */ + int** p2pAccessEnabled; + + /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU + copies are allowed via p2p if p2p access is enabled at all for + the pair of GPUs in question, but if this flag is true, then + all cross-GPU access checks are disabled, allowing kernels to + directly access memory on another GPUs. + Note that p2p access must exist and be enabled for the pair of + GPUs in question. 
*/ + int p2pKernelAccessEnabled; + + void (*cutorchGCFunction)(void *data); + void *cutorchGCData; + ptrdiff_t heapSoftmax; + ptrdiff_t heapDelta; +}; diff --git a/aten/src/THC/THCGenerateAllTypes.h b/aten/src/THC/THCGenerateAllTypes.h new file mode 100644 index 0000000..27a8bd2 --- /dev/null +++ b/aten/src/THC/THCGenerateAllTypes.h @@ -0,0 +1,37 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h" +#endif + +#define THCGenerateAllTypes + +#define THCTypeIdxByte 1 +#define THCTypeIdxChar 2 +#define THCTypeIdxShort 3 +#define THCTypeIdxInt 4 +#define THCTypeIdxLong 5 +#define THCTypeIdxFloat 6 +#define THCTypeIdxDouble 7 +#define THCTypeIdxHalf 8 +#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) + +#include "THCGenerateByteType.h" +#include "THCGenerateCharType.h" +#include "THCGenerateShortType.h" +#include "THCGenerateIntType.h" +#include "THCGenerateLongType.h" +#include "THCGenerateHalfType.h" +#include "THCGenerateFloatType.h" +#include "THCGenerateDoubleType.h" + +#undef THCTypeIdxByte +#undef THCTypeIdxChar +#undef THCTypeIdxShort +#undef THCTypeIdxInt +#undef THCTypeIdxLong +#undef THCTypeIdxFloat +#undef THCTypeIdxDouble +#undef THCTypeIdxHalf +#undef THCTypeIdx_ + +#undef THCGenerateAllTypes +#undef THC_GENERIC_FILE diff --git a/aten/src/THC/THCGenerateByteType.h b/aten/src/THC/THCGenerateByteType.h new file mode 100644 index 0000000..4f76800 --- /dev/null +++ b/aten/src/THC/THCGenerateByteType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateByteType.h" +#endif + +#define real uint8_t +#define accreal int64_t +#define Real Byte +#define CReal CudaByte +#define THC_REAL_IS_BYTE +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_BYTE + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateCharType.h b/aten/src/THC/THCGenerateCharType.h new file mode 100644 index 0000000..ec86b1a --- /dev/null +++ b/aten/src/THC/THCGenerateCharType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateCharType.h" +#endif + +#define real int8_t +#define accreal int64_t +#define Real Char +#define CReal CudaChar +#define THC_REAL_IS_CHAR +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_CHAR + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateDoubleType.h b/aten/src/THC/THCGenerateDoubleType.h new file mode 100644 index 0000000..fdf6a8e --- /dev/null +++ b/aten/src/THC/THCGenerateDoubleType.h @@ -0,0 +1,22 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateDoubleType.h" +#endif + +#define real double +#define accreal double +#define Real Double +#define CReal CudaDouble +#define THC_REAL_IS_DOUBLE +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_DOUBLE + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateFloatType.h b/aten/src/THC/THCGenerateFloatType.h new file mode 100644 index 0000000..997988d --- /dev/null +++ b/aten/src/THC/THCGenerateFloatType.h @@ -0,0 +1,24 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including 
THGenerateFloatType.h" +#endif + +#define real float +/* FIXME: fp64 has bad performance on some platforms; avoid using it unless + we opt into it? */ +#define accreal float +#define Real Float +#define CReal Cuda +#define THC_REAL_IS_FLOAT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_FLOAT + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateFloatTypes.h b/aten/src/THC/THCGenerateFloatTypes.h new file mode 100644 index 0000000..11bf46d --- /dev/null +++ b/aten/src/THC/THCGenerateFloatTypes.h @@ -0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateFloatTypes.h" +#endif + +#define THCGenerateFloatTypes + +#define THCTypeIdxByte 1 +#define THCTypeIdxChar 2 +#define THCTypeIdxShort 3 +#define THCTypeIdxInt 4 +#define THCTypeIdxLong 5 +#define THCTypeIdxFloat 6 +#define THCTypeIdxDouble 7 +#define THCTypeIdxHalf 8 +#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) + +#include "THCGenerateHalfType.h" +#include "THCGenerateFloatType.h" +#include "THCGenerateDoubleType.h" + +#undef THCTypeIdxByte +#undef THCTypeIdxChar +#undef THCTypeIdxShort +#undef THCTypeIdxInt +#undef THCTypeIdxLong +#undef THCTypeIdxFloat +#undef THCTypeIdxDouble +#undef THCTypeIdxHalf +#undef THCTypeIdx_ + +#undef THCGenerateFloatTypes +#undef THC_GENERIC_FILE diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h new file mode 100644 index 0000000..77d4c0a --- /dev/null +++ b/aten/src/THC/THCGenerateHalfType.h @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" +#endif + +#include "THCHalf.h" + +#if defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF) + +#define real half +#define accreal float +#define Real Half + +// if only here via FORCE_TH_HALF, don't define CReal since +// FORCE_TH_HALF should only be used for TH types +#ifdef CUDA_HALF_TENSOR +#define CReal CudaHalf +#endif + +#define THC_REAL_IS_HALF +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real + +#ifdef CUDA_HALF_TENSOR +#undef CReal +#endif + +#undef THC_REAL_IS_HALF + +#endif // defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF) + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateIntType.h b/aten/src/THC/THCGenerateIntType.h new file mode 100644 index 0000000..ec393dd --- /dev/null +++ b/aten/src/THC/THCGenerateIntType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateIntType.h" +#endif + +#define real int32_t +#define accreal int64_t +#define Real Int +#define CReal CudaInt +#define THC_REAL_IS_INT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_INT + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateLongType.h b/aten/src/THC/THCGenerateLongType.h new file mode 100644 index 0000000..f47840c --- /dev/null +++ b/aten/src/THC/THCGenerateLongType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateLongType.h" +#endif + +#define real int64_t +#define accreal int64_t +#define Real Long +#define CReal CudaLong +#define THC_REAL_IS_LONG +#line 1 
THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_LONG + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateShortType.h b/aten/src/THC/THCGenerateShortType.h new file mode 100644 index 0000000..cfc5536 --- /dev/null +++ b/aten/src/THC/THCGenerateShortType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateShortType.h" +#endif + +#define real int16_t +#define accreal int64_t +#define Real Short +#define CReal CudaShort +#define THC_REAL_IS_SHORT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_SHORT + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerator.hpp b/aten/src/THC/THCGenerator.hpp new file mode 100644 index 0000000..ea5d1ba --- /dev/null +++ b/aten/src/THC/THCGenerator.hpp @@ -0,0 +1,20 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include +#include + +typedef struct THCGeneratorState { + struct curandStateMtgp32* gen_states; + struct mtgp32_kernel_params *kernel_params; + int initf; + uint64_t initial_seed; + std::atomic philox_seed_offset; +} THCGeneratorState; + +struct THCGenerator { + std::mutex mutex; /* mutex for using this generator */ + THCGeneratorState state; +}; diff --git a/aten/src/THC/THCHalf.cu b/aten/src/THC/THCHalf.cu new file mode 100644 index 0000000..7863260 --- /dev/null +++ b/aten/src/THC/THCHalf.cu @@ -0,0 +1,51 @@ +#include "THCHalf.h" +#include "THCThrustAllocator.cuh" +#include +#include + +struct __half2floatOp { + __device__ float operator()(half v) { return __half2float(v); } +}; + +struct __float2halfOp { + __device__ half operator()(float v) { return __float2half(v); } +}; + +void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) { + THCThrustAllocator thrustAlloc(state); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + in, in + len, out, __float2halfOp()); +} + +void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { + THCThrustAllocator thrustAlloc(state); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + in, in + len, out, __half2floatOp()); +} + +THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { + cudaDeviceProp* prop = + THCState_getCurrentDeviceProperties(state); + + // CC 5.3+ + return (prop->major > 5 || + (prop->major == 5 && prop->minor == 3)); +} + +THC_EXTERNC int THC_fastHalfInstructions(THCState *state) { + cudaDeviceProp* prop = + THCState_getCurrentDeviceProperties(state); + + // Check for CC 6.0 only (corresponds to P100) + return (prop->major == 6 && prop->minor == 0); +} diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h new file mode 100644 index 0000000..d9b8cba --- /dev/null +++ b/aten/src/THC/THCHalf.h @@ -0,0 +1,35 @@ +#ifndef THC_HALF_CONVERSION_INC +#define THC_HALF_CONVERSION_INC + +#include "THCGeneral.h" + +/* We compile with CudaHalfTensor support if we have this: */ +#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__) +#define CUDA_HALF_TENSOR 1 +#endif + +#ifdef CUDA_HALF_TENSOR + +#include +#include + +#if CUDA_VERSION >= 9000 || 
defined(__HIP_PLATFORM_HCC__) +#ifndef __cplusplus +typedef __half_raw half; +#endif +#endif + +THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); +THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); +THC_API half THC_float2half(float a); +THC_API float THC_half2float(half a); + +/* Check for native fp16 support on the current device (CC 5.3+) */ +THC_API int THC_nativeHalfInstructions(THCState *state); + +/* Check for performant native fp16 support on the current device */ +THC_API int THC_fastHalfInstructions(THCState *state); + +#endif /* CUDA_HALF_TENSOR */ + +#endif diff --git a/aten/src/THC/THCIntegerDivider.cuh b/aten/src/THC/THCIntegerDivider.cuh new file mode 100644 index 0000000..cf71deb --- /dev/null +++ b/aten/src/THC/THCIntegerDivider.cuh @@ -0,0 +1,120 @@ +#ifndef THC_INTEGER_DIVIDER_INC +#define THC_INTEGER_DIVIDER_INC + +#include + +// A utility class to implement integer division by muliplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. +// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + __host__ __device__ DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() { } // Dummy constructor for arrays. + IntDivider(Value d) : divisor(d) { } + + __host__ __device__ inline Value div(Value n) const { return n / divisor; } + __host__ __device__ inline Value mod(Value n) const { return n % divisor; } + __host__ __device__ inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() { } // Dummy constructor for arrays. 
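  // Worked example of the scheme described above (illustrative only, not
  // used by the code): for d = 7, the smallest s with 2^s >= 7 is s = 3, and
  //   m' = floor(2^32 * (2^3 - 7) / 7) + 1 = 613566757.
  // Dividing n = 100 then proceeds as
  //   t = (m' * 100) >> 32 = 14,  (t + 100) >> 3 = 114 >> 3 = 14 = 100 / 7.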
+ + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + __host__ __device__ inline unsigned int div(unsigned int n) const { +#ifdef __CUDA_ARCH__ + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + __host__ __device__ inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + __host__ __device__ inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +#endif // THC_INTEGER_DIVIDER_INC diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh new file mode 100644 index 0000000..36823c0 --- /dev/null +++ b/aten/src/THC/THCNumerics.cuh @@ -0,0 +1,812 @@ +#ifndef THC_NUMERICS_INC +#define THC_NUMERICS_INC + +#include +#include +#include +#include "THCHalf.h" + +/// Class for numeric limits of the particular data type, which +/// includes support for `half`. +/// Unfortunately since `half` does not have a constructor, these have +/// to be expressed as functions (either that or non-const statics). +template +struct THCNumerics { +}; + +template +static inline __host__ __device__ scalar_t powi(scalar_t a, scalar_t b) { + assert(THCNumerics::ge(b, 0)); + scalar_t result = 1; + while (b) { + if (b & 1) { + result *= a; + } + b /= 2; + a *= a; + } + return result; +} + +template <> +struct THCNumerics { + static inline __host__ __device__ uint8_t min() { return 0; } + static inline __host__ __device__ uint8_t max() { return UCHAR_MAX; } + + static inline __host__ __device__ bool lt(uint8_t a, uint8_t b) { return a < b; } + static inline __host__ __device__ bool le(uint8_t a, uint8_t b) { return a <= b; } + static inline __host__ __device__ bool gt(uint8_t a, uint8_t b) { return a > b; } + static inline __host__ __device__ bool ge(uint8_t a, uint8_t b) { return a >= b; } + static inline __host__ __device__ bool eq(uint8_t a, uint8_t b) { return a == b; } + static inline __host__ __device__ bool ne(uint8_t a, uint8_t b) { return a != b; } + + static inline __host__ __device__ uint8_t neg(int8_t a) { return -a; } + static inline __host__ __device__ uint8_t add(uint8_t a, uint8_t b) { return a + b; } + static inline __host__ __device__ uint8_t mul(uint8_t a, uint8_t b) { return a * b; } + static inline __host__ __device__ uint8_t sub(uint8_t a, uint8_t b) { return a - b; } + static inline __host__ __device__ uint8_t div(uint8_t a, uint8_t b) { return a / b; } + static inline __host__ __device__ uint8_t abs(uint8_t a) { return a; } + static inline __host__ __device__ uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(uint8_t a) { return false; } + static inline __host__ __device__ bool isinf(uint8_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int8_t min() { return 
SCHAR_MIN; } + static inline __host__ __device__ int8_t max() { return SCHAR_MAX; } + + static inline __host__ __device__ bool lt(int8_t a, int8_t b) { return a < b; } + static inline __host__ __device__ bool le(int8_t a, int8_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int8_t a, int8_t b) { return a > b; } + static inline __host__ __device__ bool ge(int8_t a, int8_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int8_t a, int8_t b) { return a == b; } + static inline __host__ __device__ bool ne(int8_t a, int8_t b) { return a != b; } + + static inline __host__ __device__ int8_t neg(int8_t a) { return -a; } + static inline __host__ __device__ int8_t add(int8_t a, int8_t b) { return a + b; } + static inline __host__ __device__ int8_t mul(int8_t a, int8_t b) { return a * b; } + static inline __host__ __device__ int8_t sub(int8_t a, int8_t b) { return a - b; } + static inline __host__ __device__ int8_t div(int8_t a, int8_t b) { return a / b; } + static inline __host__ __device__ int8_t abs(int8_t a) { return ::abs((int)a); } + static inline __host__ __device__ int8_t pow(int8_t a, int8_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int8_t a) { return false; } + static inline __host__ __device__ bool isinf(int8_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int16_t min() { return SHRT_MIN; } + static inline __host__ __device__ int16_t max() { return SHRT_MAX; } + + static inline __host__ __device__ bool lt(int16_t a, int16_t b) { return a < b; } + static inline __host__ __device__ bool le(int16_t a, int16_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int16_t a, int16_t b) { return a > b; } + static inline __host__ __device__ bool ge(int16_t a, int16_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int16_t a, int16_t b) { return a == b; } + static inline __host__ __device__ bool ne(int16_t a, int16_t b) { return a != b; } + + static inline __host__ __device__ int16_t neg(int16_t a) { return -a; } + static inline __host__ __device__ int16_t add(int16_t a, int16_t b) { return a + b; } + static inline __host__ __device__ int16_t mul(int16_t a, int16_t b) { return a * b; } + static inline __host__ __device__ int16_t sub(int16_t a, int16_t b) { return a - b; } + static inline __host__ __device__ int16_t div(int16_t a, int16_t b) { return a / b; } + static inline __host__ __device__ int16_t abs(int16_t a) { return ::abs((int)a); } + static inline __host__ __device__ int16_t pow(int16_t a, int16_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int16_t a) { return false; } + static inline __host__ __device__ bool isinf(int16_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int32_t min() { return INT_MIN; } + static inline __host__ __device__ int32_t max() { return INT_MAX; } + + static inline __host__ __device__ bool lt(int32_t a, int32_t b) { return a < b; } + static inline __host__ __device__ bool le(int32_t a, int32_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int32_t a, int32_t b) { return a > b; } + static inline __host__ __device__ bool ge(int32_t a, int32_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int32_t a, int32_t b) { return a == b; } + static inline __host__ __device__ bool ne(int32_t a, int32_t b) { return a != b; } + + static inline __host__ __device__ int32_t neg(int32_t a) { return -a; } + static inline 
__host__ __device__ int32_t add(int32_t a, int32_t b) { return a + b; } + static inline __host__ __device__ int32_t mul(int32_t a, int32_t b) { return a * b; } + static inline __host__ __device__ int32_t sub(int32_t a, int32_t b) { return a - b; } + static inline __host__ __device__ int32_t div(int32_t a, int32_t b) { return a / b; } + static inline __host__ __device__ int32_t abs(int32_t a) { return ::abs(a); } + static inline __host__ __device__ int32_t pow(int32_t a, int32_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int32_t a) { return false; } + static inline __host__ __device__ bool isinf(int32_t a) { return false; } +}; + +template <> +struct THCNumerics { +#ifdef _MSC_VER + static inline __host__ __device__ int64_t min() { return _I64_MIN; } + static inline __host__ __device__ int64_t max() { return _I64_MAX; } +#else + static inline __host__ __device__ int64_t min() { return LONG_MIN; } + static inline __host__ __device__ int64_t max() { return LONG_MAX; } +#endif + + static inline __host__ __device__ bool lt(int64_t a, int64_t b) { return a < b; } + static inline __host__ __device__ bool le(int64_t a, int64_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int64_t a, int64_t b) { return a > b; } + static inline __host__ __device__ bool ge(int64_t a, int64_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int64_t a, int64_t b) { return a == b; } + static inline __host__ __device__ bool ne(int64_t a, int64_t b) { return a != b; } + + + static inline __host__ __device__ int64_t neg(int64_t a) { return -a; } + static inline __host__ __device__ int64_t add(int64_t a, int64_t b) { return a + b; } + static inline __host__ __device__ int64_t mul(int64_t a, int64_t b) { return a * b; } + static inline __host__ __device__ int64_t sub(int64_t a, int64_t b) { return a - b; } + static inline __host__ __device__ int64_t div(int64_t a, int64_t b) { return a / b; }; + static inline __host__ __device__ int64_t abs(int64_t a) { return labs(a); } + static inline __host__ __device__ int64_t pow(int64_t a, int64_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int64_t a) { return false; } + static inline __host__ __device__ bool isinf(int64_t a) { return false; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct THCNumerics { +#if CUDA_VERSION < 9000 + static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; } + static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; } +#else + static inline __host__ __device__ half min() { __half_raw h; h.x = 0xfbff; return h; } + static inline __host__ __device__ half max() { __half_raw h; h.x = 0x7bff; return h; } +#endif + + static inline __host__ __device__ bool lt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hlt(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa < fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) < THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool le(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hle(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa <= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) <= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool gt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hgt(a, b); +#else + float fa = __half2float(a); + float fb 
= __half2float(b); + return fa > fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) > THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ge(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hge(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa >= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) >= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool eq(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __heq(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa == fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) == THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ne(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hne(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa != fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) != THC_half2float(b); +#endif + } + + static inline __host__ __device__ half exp(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hexp(a); +#else + float fa = __half2float(a); + return __float2half(expf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(expf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half exp10(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hexp10(a); +#else + float fa = __half2float(a); + return __float2half(exp10f(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(exp10f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hlog(a); +#else + float fa = __half2float(a); + return __float2half(logf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(logf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log10(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log10f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log10f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log1p(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log1pf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log1pf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log2(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log2f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log2f(THC_half2float(a))); +#endif + } + +static inline __host__ __device__ half lgamma(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(lgammaf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(lgammaf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half expm1(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(expm1f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(expm1f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half cos(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hcos(a); +#else + float fa = __half2float(a); + return __float2half(cosf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(cosf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sin(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return 
hsin(a); +#else + float fa = __half2float(a); + return __float2half(sinf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(sinf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sqrt(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hsqrt(a); +#else + float fa = __half2float(a); + return __float2half(sqrtf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(sqrtf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half rsqrt(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hrsqrt(a); +#else + float fa = __half2float(a); + return __float2half(rsqrtf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(rsqrtf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half ceil(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hceil(a); +#else + float fa = __half2float(a); + return __float2half(ceilf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(ceilf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half floor(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hfloor(a); +#else + float fa = __half2float(a); + return __float2half(floorf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(floorf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half trunc(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return htrunc(a); +#else + float fa = __half2float(a); + return __float2half(truncf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(truncf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half neg(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(a); +#else + float fa = __half2float(a); + return __float2half(-fa); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(-(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half acos(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(acosf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(acosf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half cosh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(coshf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(coshf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half asin(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(asinf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(asinf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sinh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(sinhf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(sinhf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half tan(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(tanf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(tanf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half atan(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(atanf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(atanf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half tanh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(tanhf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(tanhf(THC_half2float(a))); 
+#endif + } + + + static inline __host__ __device__ half erf(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erff(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erff(THC_half2float(a))); +#endif + } + + + static inline __host__ __device__ half erfc(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erfcf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erfcf(THC_half2float(a))); +#endif + } + + + static inline __host__ __device__ half erfinv(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erfinvf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erfinvf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half abs(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(fabs(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(fabs(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half round(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(roundf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(roundf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half frac(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(fa - truncf(fa)); +#else // __CUDA_ARCH__ + float fa = THC_half2float(a); + return THC_float2half(fa - floorf(fa)); +#endif + } + + static inline __host__ __device__ half cinv(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(1.0f / fa); +#else // __CUDA_ARCH__ + return THC_float2half(1.0f / THC_half2float(a)); +#endif + } + + static inline __host__ __device__ half add(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hadd(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa + fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) + THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half div(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa / fb ); +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) / THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half mul(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hmul(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa * fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) * THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half sub(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hsub(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa - fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) - THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half pow(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(powf(fa, fb)); +#else // __CUDA_ARCH__ + return THC_float2half(powf(THC_half2float(a), THC_half2float(b))); +#endif + } + + static inline __host__ __device__ half atan2(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(atan2f(fa, fb)); +#else // __CUDA_ARCH__ + return THC_float2half(atan2f(THC_half2float(a), 
THC_half2float(b))); +#endif + } + + static inline __host__ __device__ bool isnan(half a) { + // implemented using that a!=a if and only if a is nan + return ne(a, a); + } + + static inline __host__ __device__ bool isinf(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hisinf(a) != 0; +#else + float fa = __half2float(a); + return ::isinf(fa); +#endif +#else // __CUDA_ARCH__ + return ::isinf(THC_half2float(a)); +#endif + } + +}; +#endif + +template <> +struct THCNumerics { + static inline __host__ __device__ float min() { return -FLT_MAX; } + static inline __host__ __device__ float max() { return FLT_MAX; } + + static inline __host__ __device__ bool lt(float a, float b) { return a < b; } + static inline __host__ __device__ bool le(float a, float b) { return a <= b; } + static inline __host__ __device__ bool gt(float a, float b) { return a > b; } + static inline __host__ __device__ bool ge(float a, float b) { return a >= b; } + static inline __host__ __device__ bool eq(float a, float b) { return a == b; } + static inline __host__ __device__ bool ne(float a, float b) { return a != b; } + + static inline __host__ __device__ float lgamma(float a) { return lgammaf(a);} + static inline __host__ __device__ float erfinv(float a) { return erfinvf(a);} + static inline __host__ __device__ float exp (float a) { return expf(a); } + static inline __host__ __device__ float exp10(float a) { return exp10f(a); } + static inline __host__ __device__ float log (float a) { return logf(a); } + static inline __host__ __device__ float log10(float a) { return log10f(a); } + static inline __host__ __device__ float log1p(float a) { return log1pf(a); } + static inline __host__ __device__ float log2 (float a) { return log2f(a); } + static inline __host__ __device__ float expm1(float a) { return expm1f(a); } + static inline __host__ __device__ float cos (float a) { return cosf(a); } + static inline __host__ __device__ float sin (float a) { return sinf(a); } + static inline __host__ __device__ float sqrt (float a) { return sqrtf(a); } + static inline __host__ __device__ float rsqrt(float a) { return rsqrtf(a); } + static inline __host__ __device__ float ceil (float a) { return ceilf(a); } + static inline __host__ __device__ float floor(float a) { return floorf(a); } + static inline __host__ __device__ float trunc(float a) { return truncf(a); } + static inline __host__ __device__ float neg (float a) { return -a; } + static inline __host__ __device__ float acos (float a) { return acosf(a); } + static inline __host__ __device__ float cosh (float a) { return coshf(a); } + static inline __host__ __device__ float acosh(float a) { return acoshf(a); } + static inline __host__ __device__ float asin (float a) { return asinf(a); } + static inline __host__ __device__ float sinh (float a) { return sinhf(a); } + static inline __host__ __device__ float asinh(float a) { return asinhf(a); } + static inline __host__ __device__ float tan (float a) { return tanf(a); } + static inline __host__ __device__ float atan (float a) { return atanf(a); } + static inline __host__ __device__ float tanh (float a) { return tanhf(a); } + static inline __host__ __device__ float erf (float a) { return erff(a); } + static inline __host__ __device__ float erfc (float a) { return erfcf(a); } + static inline __host__ __device__ float abs (float a) { return fabsf(a); } + static inline __host__ __device__ float round(float a) { return roundf(a); } + static inline __host__ __device__ float frac (float a) { return a - truncf(a); } + static 
inline __host__ __device__ float cinv (float a) { return 1.0f / a; } + static inline __host__ __device__ float add (float a, float b) { return a + b; } + static inline __host__ __device__ float div (float a, float b) { return a / b; } + static inline __host__ __device__ float mul (float a, float b) { return a * b; } + static inline __host__ __device__ float sub (float a, float b) { return a - b; } + static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } + static inline __host__ __device__ float atan2(float a, float b) { return atan2f(a, b); } + static inline __host__ __device__ bool isnan(float a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(float a) { return ::isinf(a); } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ double min() { return -DBL_MAX; } + static inline __host__ __device__ double max() { return DBL_MAX; } + + static inline __host__ __device__ bool lt(double a, double b) { return a < b; } + static inline __host__ __device__ bool le(double a, double b) { return a <= b; } + static inline __host__ __device__ bool gt(double a, double b) { return a > b; } + static inline __host__ __device__ bool ge(double a, double b) { return a >= b; } + static inline __host__ __device__ bool eq(double a, double b) { return a == b; } + static inline __host__ __device__ bool ne(double a, double b) { return a != b; } + + static inline __host__ __device__ double lgamma(double a) { return ::lgamma(a);} + static inline __host__ __device__ double erfinv(double a) { return ::erfinv(a);} + static inline __host__ __device__ double exp (double a) { return ::exp(a); } + static inline __host__ __device__ double exp10(double a) { return ::exp10(a); } + static inline __host__ __device__ double log (double a) { return ::log(a); } + static inline __host__ __device__ double log10(double a) { return ::log10(a); } + static inline __host__ __device__ double log1p(double a) { return ::log1p(a); } + static inline __host__ __device__ double log2 (double a) { return ::log2(a); } + static inline __host__ __device__ double expm1(double a) { return ::expm1(a); } + static inline __host__ __device__ double cos (double a) { return ::cos(a); } + static inline __host__ __device__ double sin (double a) { return ::sin(a); } + static inline __host__ __device__ double sqrt (double a) { return ::sqrt(a); } + static inline __host__ __device__ double rsqrt(double a) { return ::rsqrt(a); } + static inline __host__ __device__ double ceil (double a) { return ::ceil(a); } + static inline __host__ __device__ double floor(double a) { return ::floor(a); } + static inline __host__ __device__ double trunc(double a) { return ::trunc(a); } + static inline __host__ __device__ double neg (double a) { return -a; } + static inline __host__ __device__ double acos (double a) { return ::acos(a); } + static inline __host__ __device__ double cosh (double a) { return ::cosh(a); } + static inline __host__ __device__ double acosh(double a) { return ::acosh(a); } + static inline __host__ __device__ double asin (double a) { return ::asin(a); } + static inline __host__ __device__ double sinh (double a) { return ::sinh(a); } + static inline __host__ __device__ double asinh(double a) { return ::asinh(a); } + static inline __host__ __device__ double tan (double a) { return ::tan(a); } + static inline __host__ __device__ double atan (double a) { return ::atan(a); } + static inline __host__ __device__ double tanh (double a) { return ::tanh(a); } + static inline __host__ __device__ 
double erf (double a) { return ::erf(a); } + static inline __host__ __device__ double erfc (double a) { return ::erfc(a); } + static inline __host__ __device__ double abs (double a) { return ::abs(a); } + static inline __host__ __device__ double round(double a) { return ::round(a); } + static inline __host__ __device__ double frac (double a) { return a - ::trunc(a); } + static inline __host__ __device__ double cinv (double a) { return 1.0 / a; } + static inline __host__ __device__ double add (double a, double b) { return a + b; } + static inline __host__ __device__ double div (double a, double b) { return a / b; } + static inline __host__ __device__ double mul (double a, double b) { return a * b; } + static inline __host__ __device__ double sub (double a, double b) { return a - b; } + static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } + static inline __host__ __device__ double atan2(double a, double b) { return ::atan2(a, b); } + static inline __host__ __device__ bool isnan(double a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(double a) { return ::isinf(a); } +}; + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. +template +struct ScalarConvert { + static __host__ __device__ Out to(const In v) { return (Out) v; } +}; + +#ifdef CUDA_HALF_TENSOR +template +struct ScalarConvert { + static __host__ __device__ Out to(const half v) { +#ifdef __CUDA_ARCH__ + return (Out) __half2float(v); +#else + return (Out) THC_half2float(v); +#endif + } +}; + +template +struct ScalarConvert { + static __host__ __device__ half to(const In v) { +#ifdef __CUDA_ARCH__ + return __float2half((float) v); +#else + return THC_float2half((float) v); +#endif + } +}; + +template <> +struct ScalarConvert { + static __host__ __device__ half to(const half v) { + return v; + } +}; + +template +__host__ __device__ T scalar_cast(U u) { + return ScalarConvert::to(u); +} + +#endif + +#endif // THC_NUMERICS_INC diff --git a/aten/src/THC/THCReduce.cuh b/aten/src/THC/THCReduce.cuh new file mode 100644 index 0000000..2735847 --- /dev/null +++ b/aten/src/THC/THCReduce.cuh @@ -0,0 +1,641 @@ +#ifndef THC_REDUCE_INC +#define THC_REDUCE_INC + +// +// This file contains dimension reduction operation functions and +// kernels that work on both contiguous and non-contiguous tensor +// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned +// arguments without copying or temporary storage. 
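//
// As a concrete illustration of the operation (a sketch, not an API defined
// in this file): reducing dimension 1 of a 4 x 3 tensor with modify =
// identity and reduce = sum computes
//   out[i][0] = in[i][0] + in[i][1] + in[i][2]   for i = 0..3,
// after which finalizeOp is applied to each reduced value; the reduced
// dimension is kept with size 1 and squeezed away afterwards when keepdim
// is false.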
+// + +#include "THCTensorTypeUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCNumerics.cuh" + +// Threads per thread block +#define THC_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 +#define CHUNKPERBLOCK 256 + +template +__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() { + // Each thread handles one slice + return getLinearBlockId() * THC_NONCONTIG_REDUCE_BLOCK_SIZE + threadIdx.x; +} + +// quick hack to enable two-stage use of reduceChunk +template +struct SimpleCopyOp +{ + __device__ __forceinline__ T operator()(const T val) const + { + return val; + } +}; + +__device__ __forceinline__ int lastpow2(int n) +{ + int out = 1 << (31 - __clz(n)); + if(n == out) + out >>= 1; + return out; +} + +template + +__device__ __forceinline__ void reduceChunk + (T* out, + U* in, + const int& inbounds, + const IndexType& reductionStride, + const IndexType& reductionSize, + const IndexType& inOffset, + const IndexType& outOffset, + const int& shmem_lim, + AccT init, + AccT* shmem, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) +{ + AccT load_reg[4]; + AccT local_reg = init; + + //Unroll this loop + //for(IndexType i=threadIdx.y; i(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + const AccT val2 = scalar_cast(in[inOffset + (i + blockDim.y*2)*reductionStride]); + load_reg[2] = modifyOp(val2); + const AccT val3 = scalar_cast(in[inOffset + (i + blockDim.y*3)*reductionStride]); + load_reg[3] = modifyOp(val3); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + local_reg = reduceOp(local_reg, load_reg[2]); + local_reg = reduceOp(local_reg, load_reg[3]); + } + else if (i + blockDim.y*2 < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + const AccT val2 = scalar_cast(in[inOffset + (i + blockDim.y*2)*reductionStride]); + load_reg[2] = modifyOp(val2); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + local_reg = reduceOp(local_reg, load_reg[2]); + } + else if (i + blockDim.y < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + } + else if (i < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + local_reg = reduceOp(local_reg, modifyOp(val0)); + } + } + + *shmem = local_reg; + for(int i = lastpow2(shmem_lim); i > 0; i >>= 1) + { + __syncthreads(); + if(threadIdx.y < i && threadIdx.y + i < shmem_lim) + *shmem = reduceOp(*shmem, *(shmem + i*blockDim.x)); + } + + if(threadIdx.y == 0 && inbounds) + out[outOffset] = scalar_cast(finalizeOp(*shmem)); +} + +// Kernel that handles an entire reduction of a slice of a tensor per each thread +template + +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void kernelReduceNoncontigDim_shared + (TensorInfo out, + TensorInfo in, + IndexType reductionStride, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp, + volatile 
AccT* stagingData, + int* semaphores) +{ + IndexType sliceIndex = blockIdx.x*blockDim.x + threadIdx.x; + + __shared__ int isLastBlockDone; + __shared__ AccT local_reduce[THC_NONCONTIG_REDUCE_BLOCK_SIZE]; + AccT* shmem = &local_reduce[threadIdx.x + threadIdx.y*blockDim.x]; + + // This kernel is intended for the latency-bound case, so we want to launch enough blocks + // to cover the entire output. This means we don't need grid-stride loops. + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + const IndexType inOffset = + IndexToOffset::get(sliceIndex, in); + const int inbounds = (sliceIndex < totalSlices); + + if(gridDim.y == 1) + reduceChunk + (out.data, + in.data, + inbounds, + reductionStride, + reductionSize, + inOffset, + outOffset, + reductionSize < blockDim.y ? reductionSize : blockDim.y, + init, + shmem, + modifyOp, + reduceOp, + finalizeOp); + else + { + int* semaphore = semaphores + blockIdx.x; + + const IndexType chunkStart = blockIdx.y*CHUNKPERBLOCK; + const IndexType chunkSize = reductionSize - chunkStart < CHUNKPERBLOCK ? + reductionSize - chunkStart : CHUNKPERBLOCK; + const IndexType reductionStrideStaging = totalSlices; + const IndexType stagingOffset = sliceIndex; + + reduceChunk + (stagingData, + in.data, + inbounds, + reductionStride, + chunkSize, + inOffset + chunkStart*reductionStride, + stagingOffset + blockIdx.y*reductionStrideStaging, + chunkSize < blockDim.y ? chunkSize : blockDim.y, + init, + shmem, + modifyOp, + reduceOp, + SimpleCopyOp()); + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + + if(threadIdx.x == 0 && threadIdx.y == 0) + { + int old = atomicAdd(semaphore, 1); + isLastBlockDone = (old == gridDim.y - 1); + } + + __syncthreads(); + + // The staging area contains gridDim.y elements along each slice. The final reduction + // begins by treating the first blockDim.y elements as "init" values. + if(isLastBlockDone) + { + if(threadIdx.y < gridDim.y) + init = stagingData[stagingOffset + threadIdx.y*reductionStrideStaging]; + IndexType remaining = gridDim.y < blockDim.y ? 0 : gridDim.y - blockDim.y; + reduceChunk + (out.data, + stagingData, + inbounds, + reductionStrideStaging, + remaining, // if 0, loop in reduceChunk is skipped, otherwise... + stagingOffset + blockDim.y*reductionStrideStaging, // ...loop begins at blockDim+1th element + outOffset, + gridDim.y < blockDim.y ? 
gridDim.y : blockDim.y, + init, + shmem, + SimpleCopyOp(), + reduceOp, + finalizeOp); + } + } +} + + +// Kernel that handles an entire reduction of a slice of a tensor per each thread +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +kernelReduceNoncontigDim(TensorInfo out, + TensorInfo in, + IndexType reductionStride, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) { + const IndexType sliceIndex = getReduceNoncontigDimSliceIndex(); + + if (sliceIndex >= totalSlices) { + return; + } + + // Each thread picks a point in `out` and `in` for which it is + // producing the reduction + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + const IndexType inBaseOffset = + IndexToOffset::get(sliceIndex, in); + + // For each point in reductionSize, reduce into `r` + IndexType inOffset = inBaseOffset; + AccT r = init; + + for (IndexType i = 0; i < reductionSize; ++i) { + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + inOffset += reductionStride; + } + + // Write out reduced value + out.data[outOffset] = scalar_cast(finalizeOp(r)); +} + +template +__device__ __forceinline__ IndexType getReduceContigDimSliceIndex() { + // Each block handles one slice + return getLinearBlockId(); +} + +// Kernel that handles an entire reduction of a slice of a tensor per +// each block +template +__global__ void +kernelReduceContigDim(TensorInfo out, + TensorInfo in, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) { + const IndexType sliceIndex = getReduceContigDimSliceIndex(); + + if (sliceIndex >= totalSlices) { + return; + } + + // Get the offset in `out` for the reduction + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + + // Get the base offset in `in` for this block's reduction + const IndexType inBaseOffset = + IndexToOffset::get(sliceIndex, in); + + // Each thread in the block will reduce some subset of elements in + // the slice. The elements are guaranteed contiguous starting at + // `inBaseOffset`. + AccT r = init; + for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) { + const AccT val = scalar_cast(in.data[inBaseOffset + i]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + // FIXME: extern name + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out reduced value + out.data[outOffset] = scalar_cast(finalizeOp(r)); + } +} + +inline dim3 getNoncontigReduceBlock() { + return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE); +} + +inline dim3 getContigReduceBlock(ptrdiff_t numSlices, int64_t reductionSize) { + // If the number of slices is low but the reduction dimension size + // is high, then we should increase block size for greater parallelism. + // Aim for at least 32 warps per SM (assume 15 SMs; don't bother + // inquiring the real number for now). + int maxWarps = 4; // better occupancy if many blocks are around + // For numSlices > 15 * 8, there are > 32 warps active per SM. 
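  // As a worked instance of this heuristic (illustrative numbers only):
  // numSlices = 1000 with reductionSize = 10000 keeps maxWarps at 4, giving
  // a block of min(ceil(10000/32), 4) * 32 = 128 threads, while
  // numSlices = 20 raises maxWarps to 32 and yields a 1024-thread block.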
+ if (numSlices < 15 * 8) { + maxWarps = 8; + if (numSlices < 15 * 4) { + maxWarps = 16; + if (numSlices < 15 * 2) { + maxWarps = 32; + } + } + } + + // Scale up block size based on the reduction dimension size + int64_t warpsInReductionSize = THCCeilDiv(reductionSize, (int64_t) 32); + int numWarps = warpsInReductionSize > (int64_t) maxWarps ? + maxWarps : (int) warpsInReductionSize; + + return dim3(numWarps * 32); +} + +inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) { + // One output point per thread + return THC_getGridFromTiles(THCCeilDiv(elements, + (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); +} + +inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) { + // One output point per block + return THC_getGridFromTiles(elements, grid); +} + +// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])) for +// all in where i and the out's 0 are indexed at dimension `dim` +template +bool THC_reduceDim(THCState* state, + TensorType* out, + TensorType* in, + const ModifyOp modifyOp, + const ReduceOp reduceOp, + const FinalizeOp finalizeOp, + AccT init, + int dim, + int keepdim) { + ptrdiff_t inElements = THCTensor_nElement(state, in); + + int64_t reductionSize = THCTensor_size(state, in, dim); + int64_t reductionStride = THCTensor_stride(state, in, dim); + ptrdiff_t outElements = inElements / reductionSize; + + if (THCTensor__nDimension(state, out) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, in) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, in) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + // Is the reduction dimension contiguous? If so, then we can use a + // shared memory reduction kernel to increase performance. + bool contigReduction = (reductionStride == 1); + + dim3 block; + dim3 grid; + int smemSize = 0; // contiguous reduction uses smem + if (contigReduction) { + if (!getContigReduceGrid(outElements, grid)) { + return false; + } + + block = getContigReduceBlock(outElements, reductionSize); + smemSize = sizeof(AccT) * block.x; + } else { + if (!getNoncontigReduceGrid(outElements, grid)) { + return false; + } + + block = getNoncontigReduceBlock(); + + if(outElements <= 4096) + { + // gridDim.x and blockDim.x parallelize work across slices. + // blockDim.y enables some intra-block reduction within slices. + // gridDim.y enables inter-block reduction within slices. + + // Each block covers 32 output elements. + int blockdimx = 32; + int griddimx = THCCeilDiv((int64_t)outElements, (int64_t)blockdimx); + + // Each warp reduces at most 4 slices. This heuristic can be tuned, + // but locking blockdimy to 16 is robust and reasonably performant. 
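      // Putting the pieces together (illustrative numbers only): for
      // outElements = 1000 and reductionSize = 8192, griddimx =
      // ceil(1000/32) = 32 blocks of 32 x 16 threads cover the slices, and
      // the cooperative path set up below adds griddimy = ceil(8192/256) =
      // 32 blocks along the reduction dimension, whose partial results are
      // combined through stagingData and semaphores.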
+ int blockdimy = 16; + + int griddimy = 1; + bool coop = false; + // Rough heuristics to decide if using cooperating blocks is worthwhile + if( outElements <= 32 && reductionSize >= 4096) coop = true; + if( 32 < outElements && outElements <= 64 && reductionSize >= 4096) coop = true; + if( 64 < outElements && outElements <= 128 && reductionSize >= 4096) coop = true; + if( 128 < outElements && outElements <= 256 && reductionSize >= 4096) coop = true; + if( 256 < outElements && outElements <= 512 && reductionSize >= 4096) coop = true; + if( 512 < outElements && outElements <= 1024 && reductionSize >= 4096) coop = true; + if(1024 < outElements && outElements <= 2048 && reductionSize >= 2048) coop = true; + if(2048 < outElements && outElements <= 4096 && reductionSize >= 2048) coop = true; + // Each block reduces at most CHUNKPERBLOCK (currently 256) slices. + if(coop) + griddimy = THCCeilDiv((int64_t)reductionSize, (int64_t)CHUNKPERBLOCK); + + grid = dim3(griddimx, griddimy, 1); + block = dim3(blockdimx, blockdimy, 1); + } + } + + // Resize out to correspond to the reduced size with keepdim=True. + + // Preserve noncontiguities by unsqueezing out if necessary + THCTensor_preserveReduceDimSemantics( + state, out, THCTensor__nDimension(state, in), dim, keepdim); + + // Resize out + THLongStorage* sizes = THCTensor_newSizeOf(state, in); + THLongStorage_set(sizes, dim, 1); + THCTensor_resize(state, out, sizes, NULL); + THLongStorage_free(sizes); + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
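+  // For example, a contiguous 4 x 5 x 6 tensor collapses to a single
+  // 120-element dimension, so the index-to-offset translation becomes
+  // essentially a single multiply. The HANDLE_*_CASE macros below
+  // instantiate the collapsed 1-D and 2-D specializations plus a
+  // generic -1 (runtime-dimension) fallback for both `out` and `in`.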
+#define HANDLE_CASE(TYPE, OUT, IN) \ + if (contigReduction) { \ + kernelReduceContigDim \ + <<>> \ + (outInfo, inInfo, reductionSize, \ + (TYPE) outElements, init, modifyOp, reduceOp, finalizeOp); \ + } else { \ + if(block.y == 1){ \ + kernelReduceNoncontigDim< \ + ScalarType, \ + TYPE, AccT, ModifyOp, ReduceOp, FinalizeOp, \ + OUT, IN> \ + <<>> \ + (outInfo, inInfo, reductionStride, reductionSize, \ + (TYPE) outElements, init, modifyOp, reduceOp, finalizeOp); \ + } \ + else \ + { \ + void* stagingData; \ + void* semaphores; \ + \ + if(grid.y > 1) \ + { \ + stagingData = THCudaMalloc(state, sizeof(AccT)*outElements*grid.y);\ + semaphores = THCudaMalloc(state, sizeof(int)*grid.x); \ + THCudaCheck(cudaMemsetAsync \ + (semaphores, \ + 0, \ + sizeof(int)*grid.x, \ + THCState_getCurrentStream(state))); \ + } \ + \ + kernelReduceNoncontigDim_shared \ + \ + <<>> \ + (outInfo, \ + inInfo, \ + reductionStride, \ + reductionSize, \ + (TYPE) outElements, \ + init, \ + modifyOp, \ + reduceOp, \ + finalizeOp, \ + (volatile AccT*)stagingData, \ + (int*)semaphores); \ + \ + if(grid.y > 1) \ + { \ + THCudaFree(state, stagingData); \ + THCudaFree(state, semaphores); \ + } \ + } \ + } + +#define HANDLE_IN_CASE(TYPE, OUT, IN) \ + { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(TYPE, OUT, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, OUT, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, OUT, -1); \ + break; \ + } \ + } + +#define HANDLE_OUT_CASE(TYPE, OUT, IN) \ + { \ + switch (OUT) { \ + case 1: \ + HANDLE_IN_CASE(TYPE, 1, IN); \ + break; \ + case 2: \ + HANDLE_IN_CASE(TYPE, 2, IN); \ + break; \ + default: \ + HANDLE_IN_CASE(TYPE, -1, IN); \ + break; \ + } \ + } + + if(THCTensor_canUse32BitIndexMath(state, out) && + THCTensor_canUse32BitIndexMath(state, in)) + { + TensorInfo outInfo = + getTensorInfo(state, out); + outInfo.collapseDims(); + + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.reduceDim(dim); + inInfo.collapseDims(); + HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims); + } + else + { + TensorInfo outInfo = + getTensorInfo(state, out); + outInfo.collapseDims(); + + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.reduceDim(dim); + inInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (outInfo.dims == 1 && inInfo.dims == 1) { + HANDLE_CASE(uint64_t, 1, 1); + } else { + HANDLE_CASE(uint64_t, -1, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_IN_CASE +#undef HANDLE_OUT_CASE + + + if (!keepdim) { + THCTensor_squeeze1d(state, out, out, dim); + } + return true; +} + +#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE +#undef CHUNKPERBLOCK + +#endif // THC_REDUCE_INC diff --git a/aten/src/THC/THCReduceAll.cuh b/aten/src/THC/THCReduceAll.cuh new file mode 100644 index 0000000..5850e77 --- /dev/null +++ b/aten/src/THC/THCReduceAll.cuh @@ -0,0 +1,331 @@ +#ifndef THC_REDUCEALL_INC +#define THC_REDUCEALL_INC + +// +// This file contains dimension reduction operation functions and +// kernels that work on both contiguous and non-contiguous tensor +// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned +// arguments without copying or temporary storage, for reducing an +// entire tensor to one value. 
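+// Small inputs are reduced by a single block in one pass
+// (kernelReduceAll); inputs larger than THC_TWO_PASS_REDUCTION_SIZE
+// are reduced in two passes, with per-block partial results staged in
+// the device scratch space (kernelReduceAllPass1 / kernelReduceAllPass2).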
+// + +#include "THCReduceApplyUtils.cuh" + +// Size per each reduction block +#define THC_REDUCE_ALL_BLOCK_SIZE 1024L + +// Cutoff size for two-pass reduction +#define THC_TWO_PASS_REDUCTION_SIZE 2048L + +// Kernel that handles an entire reduction of a tensor in one pass +template +__global__ void +kernelReduceAll(TensorInfo in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + AccT* out) { + // With a block-wide stride, have each thread perform its own reduction. + AccT r = init; + for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) { + const IndexType inOffset = IndexToOffset::get(i, in); + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out reduced value + *out = r; + } +} + +template +__device__ __forceinline__ IndexType getStartIndex(IndexType totalSize) { + IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x); + return blockIdx.x * sizePerBlock; +} + +template +__device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) { + IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x); + return min((IndexType) ((blockIdx.x + 1) * sizePerBlock), totalSize); +} + +// Kernel that handles an entire reduction of a tensor in two passes +template +__global__ void +kernelReduceAllPass1(TensorInfo in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + AccT* scratchSpace) { + const IndexType startIndex = getStartIndex(totalElements); + const IndexType endIndex = getEndIndex(totalElements); + + // With a block-wide stride, have each thread perform its own reduction. + AccT r = init; + for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) { + const IndexType inOffset = IndexToOffset::get(i, in); + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out block-wide reduced value + scratchSpace[blockIdx.x] = r; + } +} + +template +__global__ void +kernelReduceAllPass2(int numPass1Blocks, + T init, + ReduceOp reduceOp, + T* scratchSpace, + T* out) { + T r = init; + if (threadIdx.x < numPass1Blocks) { + r = scratchSpace[threadIdx.x]; + } + + // Reduce within the block + extern __shared__ char smemChar[]; + T* smem = (T*) smemChar; + r = reduceBlock(smem, numPass1Blocks, r, reduceOp, init); + + if (threadIdx.x == 0) { + *out = r; + } +} + +// Perform a two-pass reduction if the tensor is large enough to +// warrant it. 
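+// A rough sketch of the two-pass sizing: with THC_REDUCE_ALL_BLOCK_SIZE
+// = 1024, pass 1 launches about ceil(elements / 1024) blocks, capped by
+// the per-device scratch space and by 1024 so that pass 2 can combine
+// all partial results in a single block (one thread per pass-1 block).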
+inline bool isTwoPassReductionSize(ptrdiff_t elements) { + return (elements > THC_TWO_PASS_REDUCTION_SIZE); +} + +template +inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) { + ptrdiff_t numBlocks = THCCeilDiv(elements, (ptrdiff_t)THC_REDUCE_ALL_BLOCK_SIZE); + + // We can only have as many blocks as there is scratch space + ptrdiff_t scratchSpace = + THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T); + THAssert(scratchSpace > 0); + + // Limit to 1024 due to dimensionality constraint + if (scratchSpace > 1024) { + scratchSpace = 1024; + } + + if (numBlocks > scratchSpace) { + numBlocks = scratchSpace; + } + + return numBlocks; +} + +// Get the block/grid size that we want +template +inline void getPass1ReduceBlockGrid(THCState* state, ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(getTwoPassBlocks(state, elements)); + block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); +} + +template +inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(1); + // We only need as many threads as there were blocks originally + block = dim3(getTwoPassBlocks(state, elements)); +} + +inline void getSinglePassReduceBlockGrid(ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(1); + block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); +} + +template +void callReduceAll(THCState* state, + const TensorInfo& in, + ptrdiff_t totalElements, + AccT init, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + AccT* devOut) { + dim3 grid; + dim3 block; + + if (isTwoPassReductionSize(totalElements)) { + void* scratchSpace = THCudaMalloc(state, THCState_getCurrentDeviceScratchSpaceSize(state)); + + getPass1ReduceBlockGrid(state, totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); + + kernelReduceAllPass1 + <<>>( + in, (IndexType) totalElements, init, modifyOp, reduceOp, + (AccT*) scratchSpace); + + int numPass1Blocks = grid.x; + getPass2ReduceBlockGrid(state, totalElements, grid, block); + smemSize = block.x * sizeof(AccT); + + kernelReduceAllPass2 + <<>>( + numPass1Blocks, init, reduceOp, + (AccT*) scratchSpace, devOut); + + THCudaFree(state, scratchSpace); + } else { + getSinglePassReduceBlockGrid(totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); + + kernelReduceAll + <<>>( + in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); + } +} + +// Reduces the entire tensor to one value. `out` points to +// host-resident memory. +template +bool THC_reduceAll(THCState* state, + TensorType* in, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + AccT init, + AccT* out, + int outOnDevice) { + ptrdiff_t inElements = THCTensor_nElement(state, in); + + if (THCTensor__nDimension(state, in) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, in) == 0) { + // Zero-dim tensor; do nothing + *out = init; + return true; + } + + bool freeDevOut = false; + AccT* devOut = out; + if (!outOnDevice) { + // Use the stream-specific scratch space for the reduction kernel + // to write out its value + devOut = static_cast(THCudaMalloc(state, + THCState_getCurrentDeviceScratchSpaceSize(state))); + freeDevOut = true; + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. 
+ // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. +#define HANDLE_CASE(TYPE, IN) \ + callReduceAll( \ + state, inInfo, inElements, init, modifyOp, \ + reduceOp, devOut); + +#define HANDLE_IN_CASE(TYPE, IN) \ + { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ + } + + if (THCTensor_canUse32BitIndexMath(state, in)) { + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.collapseDims(); + + HANDLE_IN_CASE(unsigned int, inInfo.dims); + } else { + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (inInfo.dims == 1) { + HANDLE_IN_CASE(uint64_t, 1); + } else { + HANDLE_IN_CASE(uint64_t, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_IN_CASE + + // If our destination is not on the device, copy the value back to + // the host (synchronous!) + if (!outOnDevice) { + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(out, + devOut, + sizeof(AccT), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + } + + if (freeDevOut) { + THCudaFree(state, devOut); + } + + return true; +} + +#undef THC_REDUCE_ALL_BLOCK_SIZE +#undef THC_TWO_PASS_REDUCTION_SIZE + +#endif // THC_REDUCEALL_INC diff --git a/aten/src/THC/THCReduceApplyUtils.cu b/aten/src/THC/THCReduceApplyUtils.cu new file mode 100644 index 0000000..df0169e --- /dev/null +++ b/aten/src/THC/THCReduceApplyUtils.cu @@ -0,0 +1,35 @@ +#include "THCReduceApplyUtils.cuh" + +#include +#include + +// Maximum size per grid dimension that we assume (compute capability >= 2.0) +#define MAX_GRID_SIZE 65535LL + +void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) { + int64_t dims = THCudaTensor__nDimension(state, tensor); + THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING); +} + +bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid) { + if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { + return false; + } + + int64_t gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + int64_t gridY = 1; + int64_t gridZ = 1; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); + gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); + gridZ = gridTiles > MAX_GRID_SIZE ? 
MAX_GRID_SIZE : gridTiles; + } + } + + grid = dim3(gridX, gridY, gridZ); + return true; +} diff --git a/aten/src/THC/THCReduceApplyUtils.cuh b/aten/src/THC/THCReduceApplyUtils.cuh new file mode 100644 index 0000000..bf979c5 --- /dev/null +++ b/aten/src/THC/THCReduceApplyUtils.cuh @@ -0,0 +1,152 @@ +#ifndef THC_REDUCE_APPLY_UTILS_INC +#define THC_REDUCE_APPLY_UTILS_INC + +#include +#include +#include "THCGeneral.h" +#include "THCTensor.h" +#include "THCDeviceUtils.cuh" +#include "THCTensorInfo.cuh" + +// Enum that indicates whether tensor arguments are read/write or +// read-only +enum TensorArgType { ReadWrite, ReadOnly }; + +template +__device__ __forceinline__ IndexType getLinearBlockId() { + return blockIdx.z * gridDim.y * gridDim.x + + blockIdx.y * gridDim.x + + blockIdx.x; +} + +// Reduce N values concurrently, i.e. suppose N = 2, and there are 4 threads: +// (1, 2), (3, 4), (5, 6), (7, 8), then the return in threadVals for thread 0 +// is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20) +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ void reduceNValuesInBlock(T *smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + if (numVals == 0) { + #pragma unroll + for (int i = 0; i < N; ++i) { + threadVals[i] = init; + } + return; + } + + // We store each of the N values contiguously, so if N = 2, all values for + // the first threadVal for each thread in the block are stored followed by + // all of the values for the second threadVal for each thread in the block + if (threadIdx.x < numVals) { + #pragma unroll + for (int i = 0; i < N; ++i) { + smem[i * numVals + threadIdx.x] = threadVals[i]; + } + } + __syncthreads(); + + // Number of lanes in the final reduction --> this is used to determine + // where to put the outputs of each of the n things we are reducing. If + // nLP = 32, then we have the 32 outputs for the first threadVal, + // followed by the 32 outputs for the second threadVal, etc. + const unsigned int numLanesParticipating = min(numVals, warpSize); + + if (numVals > warpSize && ((threadIdx.x / warpSize) == 0 )) { + #pragma unroll + for (int i = 0; i < N; ++i) { + threadVals[i] = threadIdx.x < numVals ? threadVals[i] : init; + } + + for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) { + #pragma unroll + for (int j = 0; j < N; ++j) { + threadVals[j] = reduceOp(threadVals[j], smem[j * numVals + i]); + } + } + + #pragma unroll + for (int i = 0; i < N; ++i) { + smem[i * numLanesParticipating + threadIdx.x] = threadVals[i]; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + if (numLanesParticipating == 32) { + #pragma unroll + for (int i = 0; i < N; ++i) { + #pragma unroll + for (int j = 1; j < 32; ++j) { + threadVals[i] = reduceOp(threadVals[i], smem[i * 32 + j]); + } + } + } else { + #pragma unroll + for (int i = 0; i < N; ++i) { + for (int j = 1; j < numLanesParticipating; ++j) { + threadVals[i] = reduceOp(threadVals[i], smem[i * numVals + j]); + } + } + } + } +} + +// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will +// return the reduced value +// +// If smem is not used again, there is no need to __syncthreads before this +// call. 
However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlock(T* smem, + const unsigned int numVals, + T threadVal, + ReduceOp reduceOp, + T init) { + reduceNValuesInBlock(smem, &threadVal, numVals, reduceOp, init); + return threadVal; +} + + +// Block-wide reduction where each thread locally reduces N +// values before letting a single warp take over - assumes +// threadVals is in registers, not shared memory +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlockWithNThreadLocalReductions(T *smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + int offset = threadIdx.x * N; + T local = offset < numVals ? threadVals[0] : init; + + #pragma unroll + for (int i = 1; i < N; ++i) { + ++offset; + T next = offset < numVals ? threadVals[i] : init; + local = reduceOp(local, next); + } + + return reduceBlock(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init); +} + +// Make sure the given tensor doesn't have too many dimensions +void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg); + +// Produces a grid with at least one point per tile +THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid); + +#endif // THC_REDUCE_APPLY_UTILS_INC diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh new file mode 100644 index 0000000..ef7c297 --- /dev/null +++ b/aten/src/THC/THCScanUtils.cuh @@ -0,0 +1,211 @@ +#ifndef THC_SCAN_UTILS_INC +#define THC_SCAN_UTILS_INC + +#include "THCAsmUtils.cuh" +#include "THCDeviceUtils.cuh" + +// Collection of in-kernel scan / prefix sum utilities + +// Inclusive Scan via an upsweep/downsweep mechanism. Assumes: +// +// 1. Power2ScanSize is a power of 2. This code still works for collections that +// do not exactly contain a power of 2 number of elements, simply round up to the +// nearest power of 2 and then call. +// +// 2. That there are two-elements per thread, i.e. the size of the smem storage +// is 2 * blockDim.x * sizeof(T). +// +// Consider a (+)-Scan on the following elements: +// +// Upsweep: +// +// 0 1 2 3 4 5 6 7 +// 1 5 9 13 +// 6 22 +// 28 +// +// Downsweep: +// 15 +// 3 10 21 +template +__device__ void inclusivePrefixScan(T *smem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = binop(smem[index], smem[index - stride]); + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = binop(smem[index + stride], smem[index]); + } + __syncthreads(); + } +} + +// Generic Op that can be used to support segmented scans by re-using +// the basic inclusiveScanOp. 
Merely requires that the input data has both +// a flag and val component +template +struct SegmentedScanOp { + __host__ __device__ SegmentedScanOp(BinaryOp binop): _binop(binop) {} + __host__ __device__ inline T operator()(const T& a, const T& b) { + T c; + c.val = a.flag ? a.val : _binop(a.val, b.val); + c.flag = a.flag | b.flag; + return c; + } + + BinaryOp _binop; +}; + +// Extends the above Inclusive Scan to support segments. It has the same properties +// but also takes a flag array that indicates the starts of "segments", i.e. individual +// units to scan. For example, consider the following (+)-scan that is segmented: +// +// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4] +// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0] +// Output: 1 4 6 4 5 2 3 5 1 5 +// +// So we see that each "flag" resets the scan to that index. +template +__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]); + bmem[index] = bmem[index] | bmem[index - stride]; + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]); + bmem[index + stride] = bmem[index + stride] | bmem[index]; + } + __syncthreads(); + } +} + +// Inclusive prefix sum using shared memory +template +__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) { + // FIXME: this is a slow, simple implementation; need up/down sweep, + // prevent smem conflicts + smem[threadIdx.x] = in; + + __syncthreads(); + + for (int offset = 1; offset < blockDim.x; offset *= 2) { + T val = 0; + + if (threadIdx.x >= offset) { + val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]); + } + + __syncthreads(); + if (threadIdx.x >= offset) { + smem[threadIdx.x] = val; + } + + __syncthreads(); + } + + *out = smem[threadIdx.x]; + + // Prevent write-after-read dependencies on smem usage above if necessary + if (KillWARDependency) { + __syncthreads(); + } +} + +// Exclusive prefix sum using shared memory +template +__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) { + // FIXME: crappy implementation + // We kill write-after-read dependencies separately below, hence the `false` + inclusivePrefixScan(smem, in, out, binop); + + *out -= in; + *carry = smem[blockDim.x - 1]; + + // Prevent write-after-read dependencies on smem usage above if necessary + if (KillWARDependency) { + __syncthreads(); + } +} + +// Inclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) { + // Within-warp, we use warp voting. + T vote = WARP_BALLOT(in); + T index = __popc(getLaneMaskLe() & vote); + T carry = __popc(vote); + + int warp = threadIdx.x / 32; + + // Per each warp, write out a value + if (getLaneId() == 0) { + smem[warp] = carry; + } + + __syncthreads(); + + // Sum across warps in one thread. 
This appears to be faster than a + // warp shuffle scan for CC 3.0+ + if (threadIdx.x == 0) { + int current = 0; + for (int i = 0; i < blockDim.x / 32; ++i) { + T v = smem[i]; + smem[i] = binop(smem[i], current); + current = binop(current, v); + } + } + + __syncthreads(); + + // load the carry from the preceding warp + if (warp >= 1) { + index = binop(index, smem[warp - 1]); + } + + *out = index; + + if (KillWARDependency) { + __syncthreads(); + } +} + +// Exclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) { + inclusiveBinaryPrefixScan(smem, in, out, binop); + + // Inclusive to exclusive + *out -= (T) in; + + // The outgoing carry for all threads is the last warp's sum + *carry = smem[(blockDim.x / 32) - 1]; + + if (KillWARDependency) { + __syncthreads(); + } +} + +#endif // THC_SCAN_UTILS_INC diff --git a/aten/src/THC/THCSleep.cu b/aten/src/THC/THCSleep.cu new file mode 100644 index 0000000..d305762 --- /dev/null +++ b/aten/src/THC/THCSleep.cu @@ -0,0 +1,21 @@ +#include "THCSleep.h" + + +__global__ void spin_kernel(int64_t cycles) +{ + // see concurrentKernels CUDA sampl + int64_t start_clock = clock64(); + int64_t clock_offset = 0; + while (clock_offset < cycles) + { + clock_offset = clock64() - start_clock; + } +} + +THC_API void THC_sleep(THCState* state, int64_t cycles) +{ + dim3 grid(1); + dim3 block(1); + spin_kernel<<>>(cycles); + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCSleep.h b/aten/src/THC/THCSleep.h new file mode 100644 index 0000000..ebd7e40 --- /dev/null +++ b/aten/src/THC/THCSleep.h @@ -0,0 +1,10 @@ +#ifndef THC_SPIN_INC +#define THC_SPIN_INC + +#include "THCGeneral.h" +#include + +// enqueues a kernel that spins for the specified number of cycles +THC_API void THC_sleep(THCState* state, int64_t cycles); + +#endif diff --git a/aten/src/THC/THCSortUtils.cu b/aten/src/THC/THCSortUtils.cu new file mode 100644 index 0000000..2561034 --- /dev/null +++ b/aten/src/THC/THCSortUtils.cu @@ -0,0 +1,17 @@ +#include "THCSortUtils.cuh" + +// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks +uint64_t nextHighestPowerOf2(uint64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; +#ifndef _MSC_VER + n |= n >> 32; +#endif + n++; + + return n; +} diff --git a/aten/src/THC/THCSortUtils.cuh b/aten/src/THC/THCSortUtils.cuh new file mode 100644 index 0000000..518063a --- /dev/null +++ b/aten/src/THC/THCSortUtils.cuh @@ -0,0 +1,216 @@ +#ifndef THC_SORT_UTILS_INC +#define THC_SORT_UTILS_INC + +#include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCNumerics.cuh" + +// Collection of kernel sort routines +template +struct LTComp { + __device__ inline bool operator()(const T& a, const T& b) const { + return THCNumerics::lt(a, b); + } +}; + +template +struct GTComp { + __device__ inline bool operator()(const T& a, const T& b) const { + return THCNumerics::gt(a, b); + } +}; + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA, + K& kB, V& vB, bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSwapKeys(K& kA, bool& 
validA, + K& kB, bool& validB, + bool dir, + const Comparator& comp) { + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(validA, validB); + } +} + +template +__device__ inline void bitonicSort(K keys[Power2SortSize], + V values[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#pragma unroll + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#pragma unroll + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#pragma unroll + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +template +__device__ inline void bitonicSortKeys(K keys[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#pragma unroll + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#pragma unroll + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], valid[pos], + keys[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#pragma unroll + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], valid[pos], + keys[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +// Sorts (key, value) pairs (in different tensors) in-place; i.e., +// modifies the input `keys` and `values` +template +__launch_bounds__(1024) +__global__ void +bitonicSortKVInPlace(TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + TensorInfo values, + IndexType valueSliceStride, + const Comparator& comp) { + // Find the slice of the tensor that we are sorting + const IndexType linearIndex = getLinearBlockId(); + // Tiling the slices could have us be out of bounds, if there are a + // lot of slices to sort + if (linearIndex >= keySlices) { + return; + } + + __shared__ K sharedKeys[Power2SortSize]; + __shared__ V sharedValues[Power2SortSize]; + __shared__ bool sharedValid[Power2SortSize]; + + const IndexType keyStartOffset = + IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + IndexToOffset::get(linearIndex, values); + + // If the sort size is 1, the data is already sorted + if (Power2SortSize == 1) { + return; + } else { + // Otherwise, each thread is responsible for loading and storing 2 + // elements. The sort size is guaranteed to be >= 2 + const int elem1 = threadIdx.x; + const int elem2 = threadIdx.x + (Power2SortSize / 2); + + bool valid1 = (elem1 < keySliceSize); + K k1 = valid1 ? + keys.data[keyStartOffset + elem1 * keySliceStride] : ScalarConvert::to(0); + V v1 = valid1 ? 
+ values.data[valueStartOffset + elem1 * valueSliceStride] : ScalarConvert::to(0); + + sharedKeys[elem1] = k1; + sharedValues[elem1] = v1; + sharedValid[elem1] = valid1; + + bool valid2 = (elem2 < keySliceSize); + K k2 = valid2 ? + keys.data[keyStartOffset + elem2 * keySliceStride] : ScalarConvert::to(0); + V v2 = valid2 ? + values.data[valueStartOffset + elem2 * valueSliceStride] : ScalarConvert::to(0); + + sharedKeys[elem2] = k2; + sharedValues[elem2] = v2; + sharedValid[elem2] = valid2; + + // Sort! + bitonicSort( + sharedKeys, sharedValues, sharedValid, comp); + + // elem1 and elem2 values might be out-of-range, if the data size we are + // sorting is smaller than half the power2 size + if (valid1) { + keys.data[keyStartOffset + elem1 * keySliceStride] = + sharedKeys[elem1]; + values.data[valueStartOffset + elem1 * valueSliceStride] = + sharedValues[elem1]; + } + + if (valid2) { + keys.data[keyStartOffset + elem2 * keySliceStride] = + sharedKeys[elem2]; + values.data[valueStartOffset + elem2 * valueSliceStride] = + sharedValues[elem2]; + } + } +} + +uint64_t nextHighestPowerOf2(uint64_t n); + +#endif // THC_SORT_UTILS_INC diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp new file mode 100644 index 0000000..c4ff07c --- /dev/null +++ b/aten/src/THC/THCStorage.cpp @@ -0,0 +1,111 @@ +#include "THCStorage.hpp" +#include "THCGeneral.h" + +#include "THCHalf.h" + +#include + +#include "generic/THCStorage.cpp" +#include "THCGenerateAllTypes.h" + +THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) +{ + return THCStorage_newWithSize(state, scalar_type, 0); +} + +THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) +{ + return THCStorage_newWithAllocator( + state, scalar_type, size, + state->cudaDeviceAllocator); +} + +THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator) +{ + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->scalar_type = scalar_type; + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + storage->size = size; + + at::DataPtr ptr; + try { + ptr = allocator->allocate(size * at::elementSize(scalar_type)); + } catch(...) 
{ + free(storage); + throw; + } + new (&storage->data_ptr) at::DataPtr(std::move(ptr)); + return storage; +} + +void THCStorage_free(THCState *state, THCStorage *storage) +{ + THStorage_free(storage); +} + +void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THArgCheck(size >= 0, 2, "invalid size"); + THAssert(self->allocator != nullptr); + int device; + THCudaCheck(cudaGetDevice(&device)); + + if(!(self->flag & TH_STORAGE_RESIZABLE)) + THError("Trying to resize storage that is not resizable"); + + size_t elementSize = at::elementSize(self->scalar_type); + + if(size == 0) + { + self->data_ptr = at::DataPtr(nullptr, at::Device(at::kCUDA, device)); + self->size = 0; + } + else + { + at::DataPtr data = + self->allocator->allocate(size * elementSize); + + if (self->data_ptr) { + // Enable p2p access when the memcpy is across devices + THCState_getPeerToPeerAccess(state, device, THCStorage_getDevice(state, self)); + + THCudaCheck(cudaMemcpyAsync(data.get(), + self->data_ptr.get(), + THMin(self->size, size) * elementSize, + cudaMemcpyDeviceToDevice, + THCState_getCurrentStream(state))); + } + + // Destructively overwrite data_ptr + self->data_ptr = std::move(data); + self->size = size; + } +} + +int THCStorage_getDevice(THCState* state, const THCStorage* storage) { + return storage->data_ptr.device().index(); +} + +THCStorage* THCStorage_newWithDataAndAllocator( + THCState *state, at::ScalarType scalar_type, at::DataPtr&& data, ptrdiff_t size, + at::Allocator *allocator) { + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(std::move(data)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu new file mode 100644 index 0000000..43a2934 --- /dev/null +++ b/aten/src/THC/THCStorage.cu @@ -0,0 +1,13 @@ +#include "THCStorage.hpp" + +#include "THCThrustAllocator.cuh" +#include +#include +#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__) +#include +#endif + +#include "THCHalf.h" + +#include "generic/THCStorage.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h new file mode 100644 index 0000000..22a607c --- /dev/null +++ b/aten/src/THC/THCStorage.h @@ -0,0 +1,12 @@ +#ifndef THC_STORAGE_INC +#define THC_STORAGE_INC + +#include "THStorage.h" +#include "THCGeneral.h" + +#define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) + +#include "generic/THCStorage.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp new file mode 100644 index 0000000..ae5ad7b --- /dev/null +++ b/aten/src/THC/THCStorage.hpp @@ -0,0 +1,39 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? 
Please +// read Note [TH abstraction violation] + +#include "THCStorage.h" +#include + +#include "ATen/ScalarType.h" +#include "ATen/ScalarTypeUtils.h" +#include + +namespace at { + +template <> +struct CTypeToScalarType<__half> : public CTypeToScalarType {}; + +} + +THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); +THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); + +THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator); + +THC_API void THCStorage_retain(THCState *state, THCStorage *storage); + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +THC_API void THCStorage_free(THCState *state, THCStorage *self); + +THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); + +THC_API THCStorage* THCStorage_newWithDataAndAllocator( + THCState *state, at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); diff --git a/aten/src/THC/THCStorageCopy.cpp b/aten/src/THC/THCStorageCopy.cpp new file mode 100644 index 0000000..9e42df5 --- /dev/null +++ b/aten/src/THC/THCStorageCopy.cpp @@ -0,0 +1,7 @@ +#include "THCStorageCopy.h" +#include "THCTensor.hpp" + +#include "THCTensorCopy.h" + +#include "generic/THCStorageCopy.cpp" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu new file mode 100644 index 0000000..8d7c869 --- /dev/null +++ b/aten/src/THC/THCStorageCopy.cu @@ -0,0 +1,10 @@ +#include "THCStorageCopy.h" +#include "THCGeneral.h" + +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCStorageCopy.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h new file mode 100644 index 0000000..837056f --- /dev/null +++ b/aten/src/THC/THCStorageCopy.h @@ -0,0 +1,11 @@ +#ifndef THC_STORAGE_COPY_INC +#define THC_STORAGE_COPY_INC + +#include "THCStorage.h" +#include "THCGeneral.h" +#include "THCHalf.h" + +#include "generic/THCStorageCopy.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCStream.cpp b/aten/src/THC/THCStream.cpp new file mode 100644 index 0000000..01fc9b0 --- /dev/null +++ b/aten/src/THC/THCStream.cpp @@ -0,0 +1,32 @@ +#include "THCStream.h" +#include "ATen/CUDAStream.h" + +THC_API THCStream* THCStream_defaultStream(int device) { + return at::detail::CUDAStream_getDefaultStreamOnDevice(device); +} + +THC_API THCStream* THCStream_new(int flags) { + return THCStream_newWithPriority(flags, at::CUDAStream::DEFAULT_PRIORITY); +} + +THC_API THCStream* THCStream_newWithPriority(int flags, int priority) { + return at::detail::CUDAStream_createAndRetainWithOptions(flags, priority); +} + +THC_API cudaStream_t THCStream_stream(THCStream* stream) { + return at::detail::CUDAStream_stream(stream); +} + +THC_API int THCStream_device(THCStream* stream) { + return at::detail::CUDAStream_device(stream); +} + +THC_API void THCStream_retain(THCStream* stream) { + at::detail::CUDAStream_retain(stream); +} + +THC_API void THCStream_free(THCStream* stream) { + at::detail::CUDAStream_free(stream); +} + + diff --git a/aten/src/THC/THCStream.h b/aten/src/THC/THCStream.h new file mode 100644 index 0000000..87e5037 --- /dev/null +++ 
b/aten/src/THC/THCStream.h @@ -0,0 +1,26 @@ +#ifndef THC_STREAM_INC +#define THC_STREAM_INC + +#include "THCGeneral.h" + +/* +* Note: legacy API. +* +* Stream usage should be done through ATen/Context.h. +*/ +typedef struct CUDAStreamInternals THCStream; + +// Stream creation +THC_API THCStream* THCStream_defaultStream(int device); +THC_API THCStream* THCStream_new(int flags); +THC_API THCStream* THCStream_newWithPriority(int flags, int priority); + +// Getters +THC_API cudaStream_t THCStream_stream(THCStream*); +THC_API int THCStream_device(THCStream*); + +// Memory management +THC_API void THCStream_retain(THCStream*); +THC_API void THCStream_free(THCStream*); + +#endif // THC_STREAM_INC diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp new file mode 100644 index 0000000..55e46bf --- /dev/null +++ b/aten/src/THC/THCTensor.cpp @@ -0,0 +1,443 @@ +#include "THCGeneral.h" +#include "THCTensor.hpp" +#include "THCTensorCopy.h" + +#include + +#include "generic/THCTensor.cpp" +#include "THCGenerateAllTypes.h" + +#include "THCTensorInfo.cuh" + +int THCTensor_nDimension(THCState *state, const THCTensor *self) { + return self->dim(); +} + +int THCTensor__nDimension(THCState *state, const THCTensor *self) { + return self->_dim(); +} + +int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); + return self->size[dim]; +} + +int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); + return self->stride[dim]; +} +THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self) { + THLongStorage *size = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(size, self->size); + return size; +} + +THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { + switch(scalar_type) { + case at::ScalarType::Byte: + return THCudaByteTensor_new(state); + case at::ScalarType::Char: + return THCudaCharTensor_new(state); + case at::ScalarType::Short: + return THCudaShortTensor_new(state); + case at::ScalarType::Int: + return THCudaIntTensor_new(state); + case at::ScalarType::Long: + return THCudaLongTensor_new(state); +#ifdef CUDA_HALF_TENSOR + case at::ScalarType::Half: + return THCudaHalfTensor_new(state); +#endif + case at::ScalarType::Float: + return THCudaTensor_new(state); + case at::ScalarType::Double: + return THCudaDoubleTensor_new(state); + default: + AT_ERROR("unexpected ScalarType: ", at::toString(scalar_type)); + } +} + +void THCTensor_resize(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) { + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + + THCTensor_resizeNd(state, self, size->size, THLongStorage_data(size), (stride ? 
THLongStorage_data(stride) : NULL)); +} + +void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { + int isSame = 0; + int d; + if(self->dim() == src->dim()) + { + isSame = 1; + for(d = 0; d < self->dim(); d++) + { + if(self->size[d] != src->size[d]) + { + isSame = 0; + break; + } + } + } + + if(!isSame) + THCTensor_resizeNd(state, self, src->dim(), src->size, NULL); +} + +void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + ptrdiff_t totalSize; + bool hascorrectsize = true; + +#ifndef USE_TH_SCALAR + AT_CHECK(nDimension > 0, "resizeNd nDimension must be greater than 0"); +#else + AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); +#endif + + for(d = 0; d < nDimension; d++) + { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. + if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif + if((self->dim() > d) && (size[d] != self->size[d])) { + hascorrectsize = false; + } + + // NB: this used to test that stride[d] was >= 0 + if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + hascorrectsize = false; + } + } + + if(nDimension != self->dim()) { + hascorrectsize = false; + } + + if(hascorrectsize) { + return; + } + + if(nDimension != self->dim()) + { + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->dim_ = nDimension; + } + + totalSize = 1; + for(d = nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) { + self->stride[d] = stride[d]; + } else { + if(d == nDimension-1) { + self->stride[d] = 1; + } else { + // Keep stride monotonically increasing to match NumPy. 
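+          // For example, resizing to sizes [2, 3, 4] yields row-major
+          // strides [12, 4, 1]; the max(size, 1) guard keeps a size-0
+          // dimension from collapsing the strides of the outer
+          // dimensions to zero.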
+ self->stride[d] = std::max(self->size[d+1],1)*self->stride[d+1]; + } + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) { + THError("Tensor: invalid null storage"); + } + if(totalSize+self->storageOffset > self->storage->size) { + THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + } + } +} + +void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) +{ + if(self != src) + THCTensor_setStorageNd(state, + self, + src->storage, + src->storageOffset, + src->dim(), + src->size, + src->stride); +} + +void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if (!self->storage) { + THError("Tensor: invalid null storage"); + } + auto scalar_type = self->storage->scalar_type; + THCStorage_free(state, self->storage); + + if(storage) + { + self->storage = storage; + THStorage_retain(self->storage); + } + else + self->storage = THCStorage_new(state, scalar_type); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + +void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck(dimension < src->dim(), 3, "dimension out of range"); + + THCTensor_set(state, self, src); + +#ifdef TH_SCALAR + if(src->size[dimension] == 1) +#else + if(src->size[dimension] == 1 && src->dim() > 1) +#endif + { + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; + } +} + +void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); +#endif + + THCTensor_set(state, self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); + self->dim_++; + for (d = self->dim()-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->dim()) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { + if (self->is_empty()) return true; + int64_t z = 1; + int d; + for(d = self->dim()-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return false; + } + } + return true; +} + +bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs) { + THAssert(numInputs > 0); + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_isContiguous(state, inputs[i])) { + return false; + } + } + return true; +} + +ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { + if(self->_dim() == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->_dim(); d++) + nElement *= self->size[d]; + return nElement; + } +} + +void 
THCTensor_retain(THCState *state, THCTensor *self) { + self->refcount++; +} + + +void THCTensor_free(THCState *state, THCTensor *self) { + THTensor_free(self); +} + +int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { + if (!tensor->storage) return -1; + return THCStorage_getDevice(state, tensor->storage); +} + +bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs) { + THAssert(numInputs > 0); + int device = THCTensor_getDevice(state, inputs[0]); + for (int i = 1; i < numInputs; ++i) { + if (THCTensor_getDevice(state, inputs[i]) != device) { + return false; + } + } + return true; +} + +bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem) { + ptrdiff_t elements = THCTensor_nElement(state, t); + if (elements >= max_elem) { + return false; + } + + ptrdiff_t offset = 0; + ptrdiff_t linearId = elements - 1; + + for (int i = THCTensor__nDimension(state, t) - 1; i >= 0; --i) { + ptrdiff_t curDimIndex = + linearId % THCTensor_size(state, t, i); + ptrdiff_t curDimOffset = curDimIndex * + THCTensor_stride(state, t, i); + offset += curDimOffset; + linearId /= THCTensor_size(state, t, i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs) { + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_canUse32BitIndexMath(state, inputs[i])) { + return false; + } + } + return true; +} + +/* Due to the resize semantics of ops with `out=` keywords, if */ \ +/* the output `tensor` has the same shape as the output of the */ \ +/* reduction operation, then any noncontiguities in the output */ \ +/* `tensor` should be preserved. This needs to be special cased b/c */ \ +/* otherwise, when keepdim=False, the implementations of reduction */ \ +/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ +/* then later squeeze `tensor` to the correct output size, breaking */ \ +/* the contiguity guarantees of the resize semantics. */ \ +void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, + int in_dims, int64_t dimension, int keepdim) { + int out_dims = THCTensor__nDimension(state, tensor); + if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { + THCTensor_unsqueeze1d(state, tensor, tensor, dimension); + } +} + +namespace { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +} + +/* Returns false if there is no possibility that the tensor */ +/* has "overlapping" indices and true otherwise. */ +/* "Overlapping" indices are two+ valid indices that specify */ +/* the same offset within the tensor. */ +/* The function does this by checking for a sufficient but not */ +/* necessary condition of no overlap. In particular, that */ +/* that there exists an ordering of the tensor's dimensions */ +/* that is nicely "nested," with each dimension contained */ +/* within the next one. */ +bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t) { + /* Extract size/stride arrays; only consider size >1 dims. 
*/ + SizeAndStride info[MAX_CUTORCH_DIMS]; + + int dims = THCTensor__nDimension(state, t); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = THCTensor_size(state, t, i); + + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = + THCTensor_stride(state, t, i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + /* Short-circuits if tensor is a single element. */ + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} diff --git a/aten/src/THC/THCTensor.cu b/aten/src/THC/THCTensor.cu new file mode 100644 index 0000000..34de80f --- /dev/null +++ b/aten/src/THC/THCTensor.cu @@ -0,0 +1,5 @@ +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCTensor.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensor.h b/aten/src/THC/THCTensor.h new file mode 100644 index 0000000..368ec99 --- /dev/null +++ b/aten/src/THC/THCTensor.h @@ -0,0 +1,20 @@ +#ifndef THC_TENSOR_INC +#define THC_TENSOR_INC + +#include "THTensor.h" +#include "THCStorage.h" +#include "THCGeneral.h" + +#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME) + +#define THC_DESC_BUFF_LEN 64 + +typedef struct THC_CLASS THCDescBuff +{ + char str[THC_DESC_BUFF_LEN]; +} THCDescBuff; + +#include "generic/THCTensor.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp new file mode 100644 index 0000000..56147b2 --- /dev/null +++ b/aten/src/THC/THCTensor.hpp @@ -0,0 +1,52 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THCTensor.h" +#include "THTensor.hpp" +#include "THCStorage.hpp" + +#include +#include + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). 
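+// (The underscore variants are what the legacy THC code paths rely on;
+// for example, THC_reduceDim and THC_reduceAll treat _nDimension == 0
+// as "zero-dim tensor; do nothing", while nDimension simply forwards
+// to the newer dim() accessor.)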
+THC_API int THCTensor_nDimension(THCState *state, const THCTensor *self); +THC_API int THCTensor__nDimension(THCState *state, const THCTensor *self); + +THC_API int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self); + +THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); + +THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_resizeAs(THCState *state, THCTensor *tensor, THCTensor *src); + +THC_API void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_); +THC_API void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_); + +THC_API bool THCTensor_isContiguous(THCState *state, const THCTensor *self); +THC_API bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs); +THC_API ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self); + +THC_API void THCTensor_retain(THCState *state, THCTensor *self); +THC_API void THCTensor_free(THCState *state, THCTensor *self); + +THC_API int THCTensor_getDevice(THCState* state, const THCTensor* tensor); +THC_API bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs); + +/* Can we use 32 bit math for indexing? */ +THC_API bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem=INT32_MAX); +/* Are all tensors 32-bit indexable? */ +THC_API bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs); +THC_API void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, int in_dims, + int64_t dimension, int keepdim); +/* Returns false if there is no possibility that the tensor */ +/* has more than one index that references the same datapoint, */ +/* true otherwise. 
*/ +THC_API bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t); diff --git a/aten/src/THC/THCTensorCopy.cpp b/aten/src/THC/THCTensorCopy.cpp new file mode 100644 index 0000000..09e043c --- /dev/null +++ b/aten/src/THC/THCTensorCopy.cpp @@ -0,0 +1,7 @@ +#include "THCTensorCopy.h" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "THCCachingHostAllocator.h" + +#include "generic/THCTensorCopy.cpp" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu new file mode 100644 index 0000000..7f42f72 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.cu @@ -0,0 +1,210 @@ +#include "THCApply.cuh" +#include "THCHalf.h" +#include "THCNumerics.cuh" +#include "THCTensorCopy.hpp" +#include + +inline int curGPU() { + int curDev; + THCudaCheck(cudaGetDevice(&curDev)); + return curDev; +} + +// Copy operator for the pointwise apply kernel +template +struct CopyOp { + __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) { +#if __CUDA_ARCH__ >= 350 + *dst = ScalarConvert::to(__ldg(src)); +#else + *dst = ScalarConvert::to(*src); +#endif + } +}; + +// Copy for the same type to the same type +template +void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { + + ptrdiff_t totalElements = THCTensor_nElement(state, dst); + + THArgCheck(totalElements == + THCTensor_nElement(state, src), + 2, "sizes do not match"); + + if (THCTensor__nDimension(state, dst) == 0) { + // Zero-dim tensor; copy nothing + return; + } + + // We can memcpy the memory if: + // -both tensors are contiguous; or, + // -there is only one element to copy; or, + // -FIXME: if both tensors have matching size and stride arrays, and no + // holes within (in other words, there is some permutation that can be applied + // to the size/strides such that the resulting tensor is + // contiguous). + // -AND: both tensors have the same type. + bool sameType = std::is_same::value; + bool srcContig = THCTensor_isContiguous(state, src); + bool dstContig = THCTensor_isContiguous(state, dst); + bool memcpyEligible = + ((srcContig && dstContig) || (totalElements == 1)) && sameType; + + int srcDev = THCTensor_getDevice(state, src); + int dstDev = THCTensor_getDevice(state, dst); + int oldDev = curGPU(); + + // Try to enable p2p access. This also handles the case srcDev == dstDev. + bool p2pEnabled = THCState_getPeerToPeerAccess(state, srcDev, dstDev); + + // We always perform the copy on the source device, using the + // current stream on the source device. + // If the copy is on the default stream, then we fully synchronize + // both src and dst's default streams for completion of the + // copy. We have to explicitly do this for non-contig copies. + // This mimics the behavior of cross-device cudaMemcpyAsync on + // the default stream. + // If the copy is not on the default stream, then it is up to the + // user to add needed synchronization on the dst device, since the + // stream on the dst device that wishes to synchronize may not be + // the same index as the one on the src device. + cudaStream_t copyStream = THCState_getCurrentStreamOnDevice(state, srcDev); + if (srcDev != dstDev && copyStream == NULL) { + // This is a cross-device copy on the default stream. We perform a + // two-way barrier between both devices' default streams before + // the copy. 
This ensures that any write-after-write and + // write-after-read dependencies on the destination side are + // handled, so that no one is operating on the dst memory when + // we perform the copy. + // src waits on dst barrier (src already waits on src) + cudaEvent_t dstReady; + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(dstReady, NULL)); + + THCudaCheck(cudaSetDevice(srcDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); + THCudaCheck(cudaEventDestroy(dstReady)); + } else if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(srcDev)); + } + + // We are now on srcDev + if (memcpyEligible) { + // Perform the copy + THCudaCheck(cudaMemcpyAsync( + dst->template data(), + src->template data(), + totalElements * + sizeof(ScalarTypeDst), + cudaMemcpyDeviceToDevice, + copyStream)); + } else { + // Non-contiguous copy or a type-conversion copy + + // We avoid creating temporary memory copies if possible. + // If both src and dst are on the same device, or if they are on + // different devices and p2p access is enabled, perform the copy + // by a pointwise copy kernel. + // Otherwise, we'll have to make contiguous (which will in fact + // invoke copy() again), and then perform the copy. + // FIXME: might want to consider only running the pointwise kernel + // if both src and dst innermost dimensions are contiguous. If + // they are not, then taking the hit of the memory allocation/free + // might be worth it to avoid non-coalesced reads or writes. + if (p2pEnabled) { + bool succ = + THC_pointwiseApply2( + state, dst, src, + CopyOp()); + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } else { + // GPUs can't access each other directly, but the tensors + // involved are non-contiguous and/or are different types. + + // Make sure the src is contiguous and in the same type as dst + THCudaCheck(cudaSetDevice(srcDev)); + THCTensor* srcContig = NULL; + + if (sameType) { + srcContig = THCTensor_newContiguous(state, src); + + } else { + // Types are different + // Copy into the new format, contiguous, on the source device + srcContig = THCTensor_new(state, + at::CTypeToScalarType::to()); + THCTensor_resizeAs(state, srcContig, dst); + + bool succ = + THC_pointwiseApply2( + state, srcContig, src, + CopyOp()); + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } + + // Make sure the dst is contiguous + THCudaCheck(cudaSetDevice(dstDev)); + THCTensor* dstContig = THCTensor_newContiguous(state, dst); + + // Now, we are ready for a cross-device memcpy of contiguous + // data, of the same layout and type + THCudaCheck(cudaSetDevice(srcDev)); + + THCudaCheck(cudaMemcpyAsync( + dstContig->template data(), + srcContig->template data(), + totalElements * + sizeof(ScalarTypeDst), + cudaMemcpyDeviceToDevice, + copyStream)); + + // We are done with the src + THCTensor_free(state, srcContig); + + if (dst != dstContig) { + THCTensor_freeCopyTo(state, dstContig, dst); + } else { + THCTensor_free(state, dstContig); + } + + // We're still on srcDev at this point + } + } + + if (srcDev != dstDev && copyStream == NULL) { + // dst waits on src barrier (dst already waits on dst). We cannot + // operate on dst's copy until the copy is complete. 
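    // Mirror image of the pre-copy barrier above: there an event was recorded
    // on dst's NULL stream and src's NULL stream waited on it; here an event
    // is recorded on src's NULL stream after the copy is enqueued and dst's
    // NULL stream waits on it, so later work queued on dst cannot start until
    // the copy has finished.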
+ + // Still on srcDev, record default stream event + cudaEvent_t srcReady; + THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(srcReady, NULL)); + + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); + THCudaCheck(cudaEventDestroy(srcReady)); + + // We are now on dstDev (right above). Restore prior device from dst + if (dstDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } else { + // We are still on srcDev. Restore prior device from src + if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#include "generic/THCTensorCopy.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h new file mode 100644 index 0000000..74f2b59 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.h @@ -0,0 +1,12 @@ +#ifndef TH_CUDA_TENSOR_COPY_INC +#define TH_CUDA_TENSOR_COPY_INC + +#include "THCTensor.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCStream.h" + +#include "generic/THCTensorCopy.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCTensorCopy.hpp b/aten/src/THC/THCTensorCopy.hpp new file mode 100644 index 0000000..8e3c762 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "THCTensorCopy.h" + +template +void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src); + +template +THCTensor *THCTensor_newClone(THCState *state, THCTensor *self); + +template +THCTensor *THCTensor_newContiguous(THCState *state, THCTensor *self); + +template +void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor *dst); + +template +void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, THCTensor* src); diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu new file mode 100644 index 0000000..68bea1b --- /dev/null +++ b/aten/src/THC/THCTensorIndex.cu @@ -0,0 +1,482 @@ +#include "THC.h" +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCTensorRandom.h" +#include "THCHalf.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCDeviceUtils.cuh" +#include "THCNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include +#include +#include // for std::min + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexCopyLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexCopySmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + int64_t dstCopyDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
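  // A host-side caller would typically choose between the two variants based
  // on how many indices there are, e.g. (hypothetical threshold and launch
  // configuration):
  //
  //   if (numIndices <= 16) {
  //     indexCopySmallIndex<...><<<smallGrid, smallBlock, 0, stream>>>(...);
  //   } else {
  //     indexCopyLargeIndex<...><<<largeGrid, largeBlock, 0, stream>>>(...);
  //   }
  //
  // The small-index variant parallelizes only over the slice (innerSize) and
  // loops over the indices; the large-index variant below parallelizes over
  // indices and slice elements together (totalSize).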
+ for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstCopyDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + + dstOffset += dstIndex * dst.strides[dstCopyDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcCopyDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexCopySmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexCopyLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstCopyDim, + int srcCopyDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstCopyDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType srcIndex, elementInSlice; + if (IndexIsMajor) { + srcIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + srcIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstCopyDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstCopyDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcCopyDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexAddLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexAddSmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + int64_t dstAddDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
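  // Unlike the copy kernels, the add kernels accumulate with atomicAdd: the
  // same destination index may appear several times in `indices`, so plain
  // stores from different indices (and from the large-index variant's
  // concurrent threads) would race instead of summing.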
+ for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstAddDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexAddSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexAddLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstAddDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType srcIndex, elementInSlice; + if (IndexIsMajor) { + srcIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + srcIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstAddDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexFillLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexFillSmallIndex(TensorInfo dst, + TensorInfo indices, + int dstFillDim, + IndexType innerSize, + int64_t dstFillDimSize, + T val) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
+ for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { + // Lua indices begin at 1 + IndexType dstIndex_ = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex_ < dstFillDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex_ * dst.strides[dstFillDim]; + + dst.data[dstOffset] = val; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexFillSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexFillLargeIndex(TensorInfo dst, + TensorInfo indices, + int dstFillDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstFillDimSize, + T val) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstIndex, elementInSlice; + if (IndexIsMajor) { + dstIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + dstIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex_ = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex_ < dstFillDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex_ * dst.strides[dstFillDim]; + + dst.data[dstOffset] = val; + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexSelectLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexSelectSmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType innerSize, + int64_t srcSelectDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
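  // Note the direction is reversed relative to the indexCopy/indexAdd kernels
  // above: here the index chooses which *source* slice to read, and the
  // output is written densely along dstSelectDim.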
+ for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { + // Lua indices begin at 1 + IndexType srcIndex = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(srcIndex < srcSelectDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexSelectSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexSelectLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType totalSize, + IndexType innerSize, + int64_t srcSelectDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstIndex, elementInSlice; + if (IndexIsMajor) { + dstIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + dstIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType srcIndex = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(srcIndex < srcSelectDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } +} + +template +__device__ __forceinline__ IndexType indexToOffset( + const TensorInfo& info, + int64_t index, + IndexType size) +{ + IndexType linearIndex = static_cast(index); + assert(linearIndex < size && linearIndex >= -size); + if (linearIndex < 0) { + linearIndex += size; + } + return IndexToOffset::get(linearIndex, info) - TH_INDEX_BASE; +} + +struct WrapIndexOp { + WrapIndexOp(int64_t size) : size(size) {} + + __device__ __forceinline__ void operator()(int64_t* out, int64_t* in) { + auto idx = *in; + assert(idx < size && idx >= -size); + *out = idx < 0 ? 
idx + size : idx; + } + + int64_t size; +}; + +template +struct TensorTakeOp { + TensorTakeOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) + : info(info), numel(numel) {} + + __device__ __forceinline__ void operator()(T* out, int64_t* index) { + auto offset = indexToOffset(info, *index, numel); + *out = info.data[offset]; + } + + const TensorInfo info; + IndexType numel; +}; + +template +struct TensorPutOp { + TensorPutOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) + : info(info), numel(numel) {} + + __device__ __forceinline__ void operator()(T* value, int64_t* index) { + auto offset = indexToOffset(info, *index, numel); + info.data[offset] = *value; + } + + const TensorInfo info; + IndexType numel; +}; + +template +struct TensorPutAccumulateOp { + TensorPutAccumulateOp(TensorInfo info, IndexType numel, int64_t* start, int64_t* end) + : info(info), numel(numel), start(start), end(end) {} + + __device__ __forceinline__ void operator()(T* value, int64_t* index) { + if (index == start || *index != *(index - 1)) { + int64_t linear_index = *index; + auto offset = indexToOffset(info, linear_index, numel); + do { + info.data[offset] = THCNumerics::add(info.data[offset], *value); + index++; + value++; + } while (index != end && *index == linear_index); + } + } + + const TensorInfo info; + IndexType numel; + int64_t* start; + int64_t* end; +}; + + +template class Op, typename TensorType> +void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { + // These are only valid if index is contiguous + auto start = THCudaLongTensor_data(state, index); + auto end = start + THCudaLongTensor_numel(state, index); + + auto aInfo = getTensorInfo(state, a); + aInfo.collapseDims(); + auto numel = THCTensor_nElement(state, a); + if (aInfo.isContiguous()) { + auto op = Op(aInfo, numel, start, end); + THC_pointwiseApply2(state, b, index, op); + } else { + auto op = Op(aInfo, numel, start, end); + THC_pointwiseApply2(state, b, index, op); + } +} + +template class Op, typename TensorType> +void dispatchTakePut(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { + if (THCTensor_canUse32BitIndexMath(state, a, INT_MAX)) { + dispatchTakePutImpl(state, a, b, index); + } else { + dispatchTakePutImpl(state, a, b, index); + } +} + +#include "generic/THCTensorIndex.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorInfo.cuh b/aten/src/THC/THCTensorInfo.cuh new file mode 100644 index 0000000..a42fa65 --- /dev/null +++ b/aten/src/THC/THCTensorInfo.cuh @@ -0,0 +1,260 @@ +#ifndef THC_TENSOR_INFO_INC +#define THC_TENSOR_INFO_INC + +#include +#include +#include "THCGeneral.h" +#include "THCIntegerDivider.cuh" +#include "THCTensor.h" + +// Maximum number of dimensions allowed for cutorch +#define MAX_CUTORCH_DIMS 25 + +// Warning string for tensor arguments that are too large or have too +// many dimensions +#define CUTORCH_STR(X) #X +#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ + CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + /* + Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of + the info, possibly excluding the optional 
excludeDim. A "collapsed" version + of the info is the fewest dims that order the tensor's elements in the same + way as the original info. If excludeDim is specified, the collapse is the + fewest dims that order the tensor's elements as the original and preserve the + excluded dimension, unless the tensor collapses to a point. + + Returns the (new) index of the preserved dimension if excludeDim is + specified. Returns 0 if the tensor is collapsed to a point. Returns -1 + otherwise. + */ + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_CUTORCH_DIMS]; + IndexType strides[MAX_CUTORCH_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]) { + data = p; + dims = dim; + assert(dims > 0 && dims < MAX_CUTORCH_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + assert(dim < dims && dim >= 0); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + + assert(excludeDim >= -1 && excludeDim < dims); + + int stopDim = (excludeDim == -1) ? dims : excludeDim; + int newIndex = -1; + int oldIndex = 0; + int remappedExcludedDim = -1; + + while (oldIndex < dims) { + // Finds a dimension to collapse into + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + ++oldIndex; + break; + } + + // Collapses dims + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { + sizes[newIndex] *= sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } else { + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } + } + + // Handles excludeDim being set (oldIndex == excludeDim) + if (oldIndex != dims) { + + // Preserves excluded dimension + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + remappedExcludedDim = newIndex; + + // Restarts iteration after excludeDim + ++oldIndex; + stopDim = dims; + } + } + + // Handles special case of all dims size 1 + if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + return 0; + } + + dims = newIndex + 1; + return remappedExcludedDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses dynamic dims + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = 
curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// OffsetInfo is a faster implementation of IndexToOffset that uses faster +// integer division: we transform each division into integer multiplication by a +// pre-computed constant. (See IntDivider for details.) +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) { + assert(tinfo.dims == Dims); + data = tinfo.data; + + for (int i = 0; i < Dims; ++i) { + sizes[i] = IntDivider(tinfo.sizes[i]); + strides[i] = tinfo.strides[i]; + } + } + + __host__ __device__ T* get(IndexType linearIndex) const { + IndexType offset = 0; + + for (int i = Dims - 1; i > 0; --i) { + DivMod divmod = sizes[i].divmod(linearIndex); + linearIndex = divmod.div; + offset += divmod.mod * strides[i]; + } + + return &data[offset + linearIndex * strides[0]]; + } + + T* data; + IntDivider sizes[Dims]; + IndexType strides[Dims]; +}; + +// For 1D tensors the offset equals linear index * stride. +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) + : data{tinfo.data}, stride{tinfo.strides[0]} {} + + __host__ __device__ T* get(IndexType linearIndex) const { + return &data[linearIndex * stride]; + } + + T* data; + const IndexType stride; +}; + +// Dims=-1 is used when the dimension is unknown at compile time. +// +// Unfortunately, pre-computation does not work here, because of a bug in nvcc +// (tested on CUDA 8.0): if a kernel argument contains an array that is +// dynamically accessed, the whole array is first copied into the local memory. +// (That is, every kernel thread makes its own copy of the argument, even if it +// is never updated.) Pre-computation makes it worse because now we have more +// data to copy. +// +// So let's fall back to vanilla division approach. 
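// Worked example of the index-to-offset loop above, assuming sizes {2, 3}
// and row-major strides {3, 1}: for linearId = 4,
//   i = 1: curDimIndex = 4 % 3 = 1, offset += 1 * 1 = 1, linearId = 4 / 3 = 1
//   result: offset + linearId * strides[0] = 1 + 1 * 3 = 4,
// which is the flat position of element (1, 1) in a contiguous layout.
// OffsetInfo computes the same thing, but each '%'/'/' pair is replaced by
// IntDivider::divmod on a pre-computed constant; the specialization below
// skips that pre-computation for the reason described above.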
+ +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) + : tinfo(tinfo) { } + + __host__ __device__ T* get(IndexType linearIndex) const { + IndexType offset = IndexToOffset::get(linearIndex, tinfo); + return &tinfo.data[offset]; + } + + TensorInfo tinfo; +}; + +#endif // THC_TENSOR_INFO_INC diff --git a/aten/src/THC/THCTensorMasked.cuh b/aten/src/THC/THCTensorMasked.cuh new file mode 100644 index 0000000..814e263 --- /dev/null +++ b/aten/src/THC/THCTensorMasked.cuh @@ -0,0 +1,58 @@ +#ifndef THC_TENSOR_MASKED_CUH +#define THC_TENSOR_MASKED_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCThrustAllocator.cuh" + +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct TensorMaskedFillOp { + TensorMaskedFillOp(T v) : value(v) {} + __device__ inline void operator()(T* t, MaskT* mask) { + if (*mask) { + *t = value; + } + } + + T value; +}; + +template +struct TensorMaskedCopyOp { + TensorMaskedCopyOp(T* s) : in(s) {} + + __device__ inline void operator()(T* out, + MaskT* mask, + MaskPrefixSumT* maskPrefixSum) { + if (*mask) { + *out = in[*maskPrefixSum]; + } + } + + // Where we are copying from + T* in; +}; + +template +struct TensorMaskedSelectOp { + TensorMaskedSelectOp(T* t) : out(t) {} + __device__ inline void operator()(MaskT* mask, + MaskPrefixSumT* maskPrefixSum, + T* in) { + if (*mask) { + out[*maskPrefixSum] = *in; + } + } + + T* out; +}; + +#endif // THC_TENSOR_MASKED_CUH diff --git a/aten/src/THC/THCTensorMath.cu b/aten/src/THC/THCTensorMath.cu new file mode 100644 index 0000000..4eded20 --- /dev/null +++ b/aten/src/THC/THCTensorMath.cu @@ -0,0 +1,140 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMath.cuh" +#include "THCThrustAllocator.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif +#include + +template +struct TensorFillOp { + TensorFillOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* v) { *v = val; } + + const T val; +}; + +// copypasta from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu +template +class strided_range +{ + public: + + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function + { + difference_type stride; + + stride_functor(difference_type stride) + : stride(stride) {} + + __host__ __device__ + difference_type operator()(const difference_type& i) const + { + return stride * i; + } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator PermutationIterator; + + // type of the strided_range iterator + typedef PermutationIterator iterator; + + // construct strided_range for the range [first,last) + strided_range(Iterator first, Iterator last, difference_type stride) + : first(first), last(last), stride(stride) {} + + iterator begin(void) const + { + return PermutationIterator(first, + TransformIterator(CountingIterator(0), + stride_functor(stride))); + } + + iterator end(void) const + { + return begin() + ((last - first) + (stride - 1)) / stride; + } + + protected: + Iterator first; + Iterator last; + difference_type stride; +}; + 
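// Usage sketch for strided_range (mirroring the thrust example it was copied
// from), assuming <thrust/device_vector.h> and <thrust/fill.h> are available:
// the iterator walks every stride-th element of the underlying range, so
// ordinary thrust algorithms can operate on a strided view in place.
static void strided_range_example() {
  thrust::device_vector<int> data(8, 0);
  typedef thrust::device_vector<int>::iterator Iterator;
  strided_range<Iterator> evens(data.begin(), data.end(), 2);
  thrust::fill(evens.begin(), evens.end(), 1);  // data: 1 0 1 0 1 0 1 0
}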
+struct idx_functor +{ + int64_t div; + int64_t size; + + __host__ __device__ + idx_functor(int64_t div, int64_t size) : div(div), size(size) {} + + __host__ __device__ + int64_t operator()(int64_t val) { + return (val / div) % size + TH_INDEX_BASE; + } +}; + +template +struct NonZeroOp +{ + NonZeroOp() {} + __host__ __device__ bool operator()(T lhs) const { + if (THCNumerics::ne(lhs, ScalarConvert::to(0.0))) { + return true; + } else { + return false; + } + } +}; + +template +struct LinspaceOp { + __host__ __device__ LinspaceOp(accT start, accT step): + start_(start), step_(step) { } + __device__ __forceinline__ T operator()(ptrdiff_t index) { + accT increment = THCNumerics::mul(step_, ScalarConvert::to(index)); + accT value = THCNumerics::add(start_, increment); + return ScalarConvert::to(value); + } + + const accT start_, step_; +}; + +template +struct LogspaceOp { + __host__ __device__ LogspaceOp(accT start, accT step): + start_(start), step_(step) { } + __device__ __forceinline__ T operator()(ptrdiff_t index) { + accT increment = THCNumerics::mul(step_, ScalarConvert::to(index)); + accT value = THCNumerics::exp10(THCNumerics::add(start_, increment)); + return ScalarConvert::to(value); + } + + const accT start_, step_; +}; + + +#include "generic/THCTensorMath.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMath.cuh b/aten/src/THC/THCTensorMath.cuh new file mode 100644 index 0000000..202090e --- /dev/null +++ b/aten/src/THC/THCTensorMath.cuh @@ -0,0 +1,130 @@ +#ifndef THC_TENSORMATH_CUH +#define THC_TENSORMATH_CUH + +// Copy the kth diagonal of a matrix B to a vector A. +template +__global__ void THCTensor_copyFromDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideA) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < size; + linearIndex += gridDim.x * blockDim.x) { + const ptrdiff_t bOffset = start + strideSum * linearIndex; + a[strideA * linearIndex] = b[bOffset]; + } +} + +// Copy vector B to the kth diagonal of a matrix A +template +__global__ void THCTensor_copyToDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideB) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < size; + linearIndex += gridDim.x * blockDim.x) { + const ptrdiff_t aOffset = start + strideSum * linearIndex; + a[aOffset] = b[strideB * linearIndex]; + } +} + +#define CAT_ARRAY_BATCH_SIZE 1024 +#define CAT_ARRAY_MAX_INPUT_DIMS 4 + +inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid) { + int curDevice = -1; + cudaGetDevice(&curDevice); + + if (curDevice == -1) { + return false; + } + + // Assume a reasonable number of SMs if no state is available + int numSM = + state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15; + //X dim of grid for cat array cooperates on a single tensor in the cat. + //Given half of the GPU, full utilization will always occur. + grid = dim3( 2LL * numSM, (long long) nTensors ); + + return true; +} + +// Similar to any other IndexToOffset calculation for copying along a given dimension. +template +struct CatArrIndexToOffset { + static inline __device__ IndexType compute( + const IndexType outputSize[Dims], + const IndexType outputStride[Dims], + const IndexType dimSize, + const unsigned int concatDim, + IndexType linearIndex) { + IndexType offset = 0; + +#pragma unroll + for (int i = Dims - 1; i >= 1; --i) { + IndexType curDimSize = i == concatDim ? 
dimSize : outputSize[i]; + IndexType nextDimIndex = linearIndex / curDimSize; + IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex; + IndexType curDimOffset = curDimIndex * outputStride[i]; + offset += curDimOffset; + linearIndex = nextDimIndex; + } + + return offset + linearIndex * outputStride[0]; + } +}; + +template +struct CatArrInputTensor { + T* input; + IndexType offset; + IndexType dimSize; + IndexType nElements; +}; + +template +struct OutputTensorSizeStride { + IndexType outputSize[MaxDims]; + IndexType outputStride[MaxDims]; +}; + +/** + * Kernel used to concatenated grimDim.y tensors into an output tensor. Uses a grid-stride loop based off of + * the blockIdx.x, threadIdx.x for each input to copy each element from each input tensor into the output. + * + * output: base pointer to the storage associated with the output tensor + * inputs: GPU-allocated array of input metadata for each input to concatenate in the kernel + * os: the size/stride vectors for the output tensor + * concatDim: dimension along which we are concatenating + * dimStride: the stride of the output tensor at the concatDim + * + * The most important assumption made is that the input tensors are contiguous. + */ + + + +template +__global__ void CatArrayBatchedCopy( + T* output, + CatArrInputTensor* inputs, + OutputTensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs[blockIdx.y].nElements; + + if(tid >= nElements) return; + + T* data = inputs[blockIdx.y].input; + IndexType offset = inputs[blockIdx.y].offset; + IndexType dimSize = inputs[blockIdx.y].dimSize; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.outputSize, os.outputStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[tid]; + + tid += stride; + } +} + +#endif diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h new file mode 100644 index 0000000..7696749 --- /dev/null +++ b/aten/src/THC/THCTensorMath.h @@ -0,0 +1,58 @@ +#ifndef TH_CUDA_TENSOR_MATH_INC +#define TH_CUDA_TENSOR_MATH_INC + +#include "THCTensor.h" +#include "THCGeneral.h" + +#include "generic/THCTensorMath.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathBlas.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathMagma.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPairwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPointwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathReduce.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompare.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompareT.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathScan.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMasked.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorScatterGather.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorIndex.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorSort.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMode.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorTopK.h" +#include "THCGenerateAllTypes.h" + +THC_API int THCudaByteTensor_logicalAndAll(THCState *state, THCudaByteTensor *self); +THC_API int 
THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self); + +THC_API void THCudaByteTensor_logicalAnd(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim); +THC_API void THCudaByteTensor_logicalAny(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim); + +#endif diff --git a/aten/src/THC/THCTensorMathBlas.cu b/aten/src/THC/THCTensorMathBlas.cu new file mode 100644 index 0000000..5551b0c --- /dev/null +++ b/aten/src/THC/THCTensorMathBlas.cu @@ -0,0 +1,10 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCTensorMathBlas.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathCompare.cuh b/aten/src/THC/THCTensorMathCompare.cuh new file mode 100644 index 0000000..9fac608 --- /dev/null +++ b/aten/src/THC/THCTensorMathCompare.cuh @@ -0,0 +1,87 @@ +#ifndef THC_TENSORMATH_COMPARE_CUH +#define THC_TENSORMATH_COMPARE_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" + +template +struct TensorLTValueOp { + TensorLTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::lt(*in, value)); + } + + const T value; +}; + +template +struct TensorGTValueOp { + TensorGTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::gt(*in, value)); + } + + const T value; +}; + + +template +struct TensorLEValueOp { + TensorLEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::le(*in, value)); + } + + const T value; +}; + +template +struct TensorGEValueOp { + TensorGEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::ge(*in, value)); + } + + const T value; +}; + +template +struct TensorEQValueOp { + TensorEQValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::eq(*in, value)); + } + + const T value; +}; + +template +struct TensorNEValueOp { + TensorNEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::ne(*in, value)); + } + + const T value; +}; + +template +void THC_logicalValue(THCState *state, + TensorTypeOut *self_, + TensorType *src, + Op op) { + THLongStorage* st = THCTensor_newSizeOf(state, src); + THCTensor_resize(state, self_, st, NULL); + THLongStorage_free(st); + + if (!THC_pointwiseApply2(state, self_, src, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_TENSORMATH_COMPARE_CUH diff --git a/aten/src/THC/THCTensorMathCompareT.cuh b/aten/src/THC/THCTensorMathCompareT.cuh new file mode 100644 index 0000000..9b1fb4e --- /dev/null +++ b/aten/src/THC/THCTensorMathCompareT.cuh @@ -0,0 +1,74 @@ +#ifndef THC_TENSORMATH_COMPARET_CUH +#define THC_TENSORMATH_COMPARET_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" + +template +struct TensorLTOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = 
ScalarConvert::to(THCNumerics::lt(*a, *b)); + } +}; + +template +struct TensorGTOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::gt(*a, *b)); + } +}; + +template +struct TensorLEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::le(*a, *b)); + } +}; + +template +struct TensorGEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::ge(*a, *b)); + } +}; + +template +struct TensorEQOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::eq(*a, *b)); + } +}; + +template +struct TensorNEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::ne(*a, *b)); + } +}; + +template +void THC_logicalTensor(THCState *state, + TensorTypeOut *self_, + TensorType *src1, + TensorType *src2, + Op op) { + THLongStorage* st = THCTensor_newSizeOf(state, src1); + THCTensor_resize(state, self_, st, NULL); + THLongStorage_free(st); + + THArgCheck(THCTensor_nElement(state, src1) == + THCTensor_nElement(state, src2), 3, + "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_TENSORMATH_COMPARET_CUH diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu new file mode 100644 index 0000000..4aa6249 --- /dev/null +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -0,0 +1,29 @@ +#include "THCGeneral.h" +#include "THCTensorMath.h" +#include "THCTensorCopy.h" +#include "THCTensorMathMagma.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include + +#ifdef USE_MAGMA +#include +#else +#include "THCBlas.h" +#endif + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)" + +void THCMagma_init(THCState *state) +{ +#ifdef USE_MAGMA + magma_init(); +#endif +} + +#include "generic/THCTensorMathMagma.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh new file mode 100644 index 0000000..6495049 --- /dev/null +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -0,0 +1,22 @@ +#ifndef THC_TENSOR_MATH_MAGMA_CUH +#define THC_TENSOR_MATH_MAGMA_CUH + +#ifdef USE_MAGMA +#include +#else +#include "THCBlas.h" +#endif + +#ifdef USE_MAGMA +template +static inline T* th_magma_malloc_pinned(size_t n) +{ + void* ptr; + if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T))) + THError("$ Torch: not enough memory: you tried to allocate %dGB. 
Buy new RAM!", n/268435456); + return reinterpret_cast(ptr); +} + +#endif + +#endif // THC_TENSOR_MATH_MAGMA_CUH diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu new file mode 100644 index 0000000..19434f3 --- /dev/null +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -0,0 +1,494 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" + +template +struct TensorAddConstantOp { + TensorAddConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in + val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v += val; + } + + const T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorAddConstantOp(half v) : val(v) {} +#else + TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fout = fin + fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + fv += fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + + +template +struct TensorSubConstantOp { + TensorSubConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in - val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v -= val; + } + + const T val; +}; + + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {} +#else + TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fout = fin + fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + fv += fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + + +template +struct TensorMulConstantOp { + TensorMulConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v *= val; + } + + const T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorMulConstantOp(half v) : val(v) {} +#else + TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fout = fin * fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + fv *= fval; + *v = __float2half(fv); 
+#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorDivConstantOp { + TensorDivConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in / val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v /= val; + } + + const T val; +}; + +template <> +struct TensorDivConstantOp { + TensorDivConstantOp(float v) : val(1.f / v) {} + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(float* v) { + *v *= val; + } + + const float val; +}; + +template <> +struct TensorDivConstantOp { + TensorDivConstantOp(double v) : val(1. / v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(double* v) { + *v *= val; + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorDivConstantOp(half v) : val(ScalarInv::to(v)) {} +#else + TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} +#endif + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fout = fin * fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + fv *= fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return (a != 0) && (a < 0) != (b < 0); +} + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return false; +} + +template +struct TensorRemainderOp { + TensorRemainderOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in % val; + if (modulo_wrap(*out, val)) { + *out += val; + } + } + + __device__ __forceinline__ void operator()(T* v) { + *v = *v % val; + if (modulo_wrap(*v, val)) { + *v += val; + } + } + + const T val; +}; + +template <> +struct TensorRemainderOp { + TensorRemainderOp(float v) : val(v) {} + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in - val * floorf(*in / val); + } + + __device__ __forceinline__ void operator()(float* v) { + *v = *v - val * floorf(*v / val); + } + + const float val; +}; + +template <> +struct TensorRemainderOp { + TensorRemainderOp(double v) : val(v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in - val * floor(*in / val); + } + + __device__ __forceinline__ void operator()(double* v) { + *v = *v - val * floor(*v / val); + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorRemainderOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorRemainderOp(half v) : val(v) {} +#else + TensorRemainderOp(half v): fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in, __hmul(val, hfloor(__hdiv(*in, val)))); +#else + float fin = __half2float(*in); + float fout = fin - fval * floorf(fin / fval); + *out = __float2half(fout); 
+#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val)))); +#else + float fv = __half2float(*v); + fv = fv - fval * floorf(fv / fval); + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorFmodOp { + TensorFmodOp(T v) : val((float)v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = (T) fmodf((float) *in, val); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = (T) fmodf((float) *v, val); + } + + const float val; +}; + +template <> +struct TensorFmodOp { + TensorFmodOp(double v) : val(v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = fmod(*in, val); + } + + __device__ __forceinline__ void operator()(double* v) { + *v = fmod(*v, val); + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorFmodOp { + TensorFmodOp(half v): fval(THC_half2float(v)) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { + *out = __float2half(fmodf(__half2float(*in), fval)); + } + + __device__ __forceinline__ void operator()(half* v) { + *v = __float2half(fmodf(__half2float(*v), fval)); + } + + const float fval; +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorTriOp { + TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) + : start(start_), stride0(stride0_), stride1(stride1_), k(k_) {} + + __device__ __forceinline__ int mask(T *out) { + ptrdiff_t n = out - start; + int64_t row, col; + if (stride0 > stride1) + { + row = (int64_t) (n / stride0); + col = (int64_t) ((n % stride0) / stride1); + } + else + { + row = (int64_t) ((n % stride1) / stride0); + col = (int64_t) (n / stride1); + } + + return Upper ? (col - row >= k) : (col - row <= k); + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = mask(out) ? 
*in : ScalarConvert::to(0); + } + + __device__ __forceinline__ void operator()(T* v) { + if (!mask(v)) + *v = ScalarConvert::to(0); + } + + const T *start; + const int64_t stride0, stride1, k; +}; + +template +struct TensorLShiftConstantOp { + TensorLShiftConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in << val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v <<= val; + } + + const T val; +}; + +template +struct TensorRShiftConstantOp { + TensorRShiftConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in >> val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v >>= val; + } + + const T val; +}; + +template +struct TensorBitAndConstantOp { + TensorBitAndConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in & val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v &= val; + } + + const T val; +}; + +template +struct TensorBitOrConstantOp { + TensorBitOrConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in | val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v |= val; + } + + const T val; +}; + +template +struct TensorBitXorConstantOp { + TensorBitXorConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in ^ val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v ^= val; + } + + const T val; +}; + +#include "generic/THCTensorMathPairwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh new file mode 100644 index 0000000..26389c3 --- /dev/null +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -0,0 +1,929 @@ +#ifndef THC_TENSORMATH_POINTWISE_CUH +#define THC_TENSORMATH_POINTWISE_CUH + +#include +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" + + +template +struct TensorATan2Op { + __device__ __forceinline__ void operator()(T* out, T* a, T* b) { + *out = THCNumerics::atan2(*a, *b); + } +}; + +template +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(T* out, T* in) const { + T one = (T) 1.0; + *out = one / (one + THCNumerics::exp(- *in)); + } + + __device__ __forceinline__ void operator()(T* v) const { + T one = (T) 1.0; + *v = one / (one + THCNumerics::exp(- *v)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(half* out, half* in) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); +#else + float fin = __half2float(*in); + *out = __float2half(1.0f / (1.0f + expf(- fin))); +#endif + } + + __device__ __forceinline__ void operator()(half* v) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); +#else + float fv = __half2float(*v); + *v = __float2half(1.0f / (1.0f + expf(- fv))); +#endif + } +}; +#endif + +template +struct TensorSignOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + T orig = *in; + *out = (orig > 0) - (orig < 0); + } + + __device__ __forceinline__ void operator()(T* v) { + T orig = *v; + *v = (orig > 0) - (orig < 0); + } +}; + +template <> +struct TensorSignOp { + __device__ __forceinline__ void 
operator()(unsigned char* out, unsigned char* in) { + unsigned char orig = *in; + *out = (orig == 0) ? 0 : 1; + } + + __device__ __forceinline__ void operator()(unsigned char* v) { + unsigned char orig = *v; + *v = (orig == 0) ? 0 : 1; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSignOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + half zero = ScalarConvert::to(0); + half orig = *in; + *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); +#else + float orig = __half2float(*in); + *out = __float2half((orig > 0) - (orig < 0)); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + half zero = ScalarConvert::to(0); + half orig = *v; + *v = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); +#else + float orig = __half2float(*v); + *v = __float2half((orig > 0) - (orig < 0)); +#endif + } +}; +#endif + +template +struct TensorAddOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout += fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 + fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorCAddOp { + TensorCAddOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += val * *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + val * *in2; + } + + T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCAddOp { + TensorCAddOp(half v) : val(v) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, __hmul(val, *in)); +#else + float fout = __half2float(*out); + float fval = __half2float(val); + float fin = __half2float(*in); + + fout += fval * fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, __hmul(val, *in2)); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fval = __half2float(val); + + float fout = fin1 + fval * fin2; + *out = __float2half(fout); +#endif + } + + half val; +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorSubOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out -= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 - *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout -= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef 
CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 - fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorMulOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out *= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 * *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout *= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 * fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + if (StaticExp == 1) { + *out = *in; + } else if (StaticExp == 2) { + *out = THCNumerics::mul(*in, *in); + } else if (StaticExp == 3) { + T square = THCNumerics::mul(*in, *in); + *out = THCNumerics::mul(square, *in); + } else { + *out = THCNumerics::pow(*in, val); + } + } + + __device__ __forceinline__ void operator()(T* v) { + if (StaticExp == 1) { + *v = *v; + } else if (StaticExp == 2) { + *v = THCNumerics::mul(*v, *v); + } else if (StaticExp == 3) { + *v = THCNumerics::mul(THCNumerics::mul(*v, *v), *v); + } else { + *v = THCNumerics::pow(*v, val); + } + } + + const T val; +}; + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::cinv(*in); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = THCNumerics::cinv(*v); + } + + const T val; +}; + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + T square = THCNumerics::mul(*in, *in); + *out = THCNumerics::cinv(square); + } + + __device__ __forceinline__ void operator()(T* v) { + T square = THCNumerics::mul(*v, *v); + *v = THCNumerics::cinv(square); + } + + const T val; +}; + +template +struct TensorTPowOp { + TensorTPowOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::pow(val, *in); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = THCNumerics::pow(val, *v); + } + + const T val; +}; + +template +struct TensorCPowOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::pow(*out, *in); + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::pow(*in1, *in2); + } +}; + +template <> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = powf(*out, *in); + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = powf(*in1, *in2); + } +}; + + +template <> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = pow(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = pow(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct 
TensorCPowOp { + __device__ __forceinline__ void operator()(half* out, half* in) { + // No fp16 pow function yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout = powf(fout, fin); + *out = __float2half(fout); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + // No fp16 pow function yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = powf(fin1, fin2); + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorDivOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out /= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 / *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivOp { + __device__ __forceinline__ void + operator()(half* out, half* in) { + // No fp16 div instruction yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout /= fin; + *out = __float2half(fout); + } + + __device__ __forceinline__ void + operator()(half* out, half* in1, half* in2) { + // No fp16 div instruction yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 / fin2; + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return (a != 0) && (a < 0) != (b < 0); +} + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return false; +} + +template +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + T val = *out % *in; + if (modulo_wrap(val, *in)) { + val += *in; + } + *out = val; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + T val = *in1 % *in2; + if (modulo_wrap(val, *in2)) { + val += *in2; + } + *out = val; + } +}; + +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; + } +}; + +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in != 0. ? *out - *in * floor(*out / *in) : NAN; + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = *in2 != 0. ? *in1 - *in2 * floor(*in1 / *in2) : NAN; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in)))); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2)))); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + *out = fin2 != 0 ? 
__float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *out % *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 % *in2; + } +}; + +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = fmodf(*out, *in); + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = fmodf(*in1, *in2); + } +}; + +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = fmod(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = fmod(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(half* out, half* in) { + *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorClampOp { + TensorClampOp(T min, T max) : minValue(min), maxValue(max) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + T val = THCNumerics::lt(*in, maxValue) ? *in : maxValue; + *out = THCNumerics::gt(minValue, val) ? minValue : val; + } + + __device__ __forceinline__ void operator()(T* v) { + T val = THCNumerics::lt(*v, maxValue) ? *v : maxValue; + *v = THCNumerics::gt(minValue, val) ? minValue : val; + } + + const T minValue; + const T maxValue; +}; + +template +struct TensorLerpOp { + TensorLerpOp(T w) : w(w) {} + + __device__ __forceinline__ void operator()(T *out, T *a, T *b) { + *out = THCNumerics::add( + *a, + THCNumerics::mul( + w, + THCNumerics::sub(*b, *a) + ) + ); + } + + const T w; +}; + +template +struct TensorCrossOp { + TensorCrossOp(int64_t sx, int64_t sy, int64_t so) : sx(sx), sy(sy), so(so) {} + + __device__ __forceinline__ void operator()(T* out, T* x, T*y) { + T val0 = THCNumerics::sub( + THCNumerics::mul(x[1 * sx], y[2 * sy]), + THCNumerics::mul(x[2 * sx], y[1 * sy]) + ); + + T val1 = THCNumerics::sub( + THCNumerics::mul(x[2 * sx], y[0 * sy]), + THCNumerics::mul(x[0 * sx], y[2 * sy]) + ); + + T val2 = THCNumerics::sub( + THCNumerics::mul(x[0 * sx], y[1 * sy]), + THCNumerics::mul(x[1 * sx], y[0 * sy]) + ); + + out[0 * so] = val0; + out[1 * so] = val1; + out[2 * so] = val2; + } + + const int64_t sx, sy, so; +}; + +template +struct TensorMaxOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::gt(*out, *in) ? *out : *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::gt(*in1, *in2) ? *in1 : *in2; + } +}; + +template +struct TensorMinOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::lt(*out, *in) ? *out : *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::lt(*in1, *in2) ? *in1 : *in2; + } +}; + +template +struct TensorMaxValueOp { + TensorMaxValueOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out) { + *out = THCNumerics::lt(*out, val) ? val : *out; // this order propagates NaN + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::lt(*in, val) ? 
val : *in; // this order propagates NaN + } + + T val; +}; + +template +struct TensorMinValueOp { + TensorMinValueOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out) { + *out = THCNumerics::gt(*out, val) ? val : *out; // this order propagates NaN + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::gt(*in, val) ? val : *in; // this order propagates NaN + } + + T val; +}; + +template +struct TensorAddCMulOp { + TensorAddCMulOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::add( + *out, + THCNumerics::mul( + val, + THCNumerics::mul(*in1, *in2) + ) + ); + } + + T val; +}; + +template +struct TensorAddCDivOp { + TensorAddCDivOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::add( + *out, + THCNumerics::mul( + val, + THCNumerics::div(*in1, *in2) + ) + ); + } + + T val; +}; + +template +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out <<= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 << *in2; + } +}; + +template <> +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(float* out, float* in) { + *out *= powf(2.0f, *in); + } + + __device__ __forceinline__ void + operator()(float* out, float* in1, float* in2) { + *out = *in1 * powf(2.0f, *in2); + } +}; + +template <> +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(double* out, double* in) { + *out *= pow(2.0, *in); + } + + __device__ __forceinline__ void + operator()(double* out, double* in1, double* in2) { + *out = *in1 * pow(2.0, *in2); + } +}; + +template +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out >>= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 >> *in2; + } +}; + + +template <> +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(float* out, float* in) { + *out /= powf(2.0f, *in); + } + + __device__ __forceinline__ void + operator()(float* out, float* in1, float* in2) { + *out = *in1 / powf(2.0f, *in2); + } +}; + +template <> +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(double* out, double* in) { + *out /= pow(2.0, *in); + } + + __device__ __forceinline__ void + operator()(double* out, double* in1, double* in2) { + *out = *in1 / pow(2.0, *in2); + } +}; + +template +struct TensorBitAndOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out &= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 & *in2; + } +}; + +template +struct TensorBitOrOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out |= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 | *in2; + } +}; + +template +struct TensorBitXorOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out ^= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 ^ *in2; + } +}; + +/* + * The following function was converted to CUDA form from code that comes + * with the following copyright notice. It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. 
Moshier + */ +template +struct TensorDigammaOp { + __device__ __forceinline__ void + operator()(real* out, real* in) { + using compute_type = typename std::conditional::value, accreal, real>::type; + static const double PI_f64 = 3.14159265358979323846; + static const compute_type PSI_10 = 2.25175258906672110764; + static const compute_type A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + auto x = scalar_cast(*in); + if (x == 0) { + *out = scalar_cast(INFINITY); + return; + } + + bool x_is_integer = x == floor(x); + compute_type result = 0; + if (x < 0) { + if (x_is_integer) { + *out = scalar_cast(INFINITY); + return; + } + // Rounding errors in tan's input can really affect the output + // for extreme values, so we always perform this computation in double. + result = scalar_cast( + - PI_f64 / tan(PI_f64 * scalar_cast(x))); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + *out = scalar_cast(result + PSI_10); + return; + } + + compute_type y = 0; + if (x < 1.0e17) { + compute_type z = 1.0 / (x * x); + + compute_type polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + *out = scalar_cast(log(x) - (0.5 / x) - y + result); + return; + } +}; + +template +struct TensorTrigammaOp { + using compute_type = typename std::conditional::value, accreal, real>::type; + __device__ __forceinline__ void + operator()(real* out, real* in) { + const compute_type PI = 3.14159265358979323846; + compute_type x = ScalarConvert::to(*in); + compute_type sign = +1; + compute_type result = 0; + if (x < 0.5f) { + sign = -1; + compute_type sin_pi_x = THCNumerics::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const compute_type ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + *out = ScalarConvert::to(sign * result); + } +}; + +#endif // THC_TENSORMATH_POINTWISE_CUH diff --git a/aten/src/THC/THCTensorMathReduce.cu b/aten/src/THC/THCTensorMathReduce.cu new file mode 100644 index 0000000..e024e1f --- /dev/null +++ b/aten/src/THC/THCTensorMathReduce.cu @@ -0,0 +1,62 @@ +#include "THCTensorMathReduce.cuh" +#include "THCTensor.hpp" + +THC_API int +THCudaByteTensor_logicalAndAll(THCState *state, THCudaByteTensor *self) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity(), + LogicalAll(), + (unsigned char) 1, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + +THC_API int +THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity(), + LogicalAny(), + (unsigned char) 0, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + +THC_API void +THCudaByteTensor_logicalAnd(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity(), + LogicalAll(), + thrust::identity(), + (unsigned char) 1, + dimension, + 
keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCudaByteTensor_logicalAny(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity(), + LogicalAny(), + thrust::identity(), + (unsigned char) 0, + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh new file mode 100644 index 0000000..5a0a804 --- /dev/null +++ b/aten/src/THC/THCTensorMathReduce.cuh @@ -0,0 +1,728 @@ +#ifndef THC_TENSORMATH_REDUCE_CUH +#define THC_TENSORMATH_REDUCE_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" +#include "THCReduceAll.cuh" +#include "THCTensorCopy.hpp" +#include "THCThrustAllocator.cuh" +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +/* +Reductions that (only) operate on accumulate types. +*/ + +template +struct ReduceAdd { + inline __device__ T operator()(const T a, const T b) const { + return THCNumerics::add(a, b); + } +}; + +template +struct ReduceMultiply { + inline __device__ T operator()(const T a, const T b) const { + return THCNumerics::mul(a, b); + } +}; + +template +struct ReduceDivide { + ReduceDivide(const T _divisor): divisor{_divisor} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::div(x, divisor); + } + + const T divisor; +}; + +template +struct ReducePow { + ReducePow(const T _exponent): exponent{_exponent} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::pow(x, exponent); + } + + const T exponent; +}; + +template +struct SquareFunctor { + SquareFunctor(const T _mean): mean{_mean} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::mul( + THCNumerics::sub(x, mean), + THCNumerics::sub(x, mean) + ); + } + + const T mean; +}; + +template +struct ReduceMin { + inline __device__ T operator()(T a, T b) const { + return (THCNumerics::lt(a, b) || THCNumerics::isnan(a)) ? a : b; + } +}; + +template +struct ReduceMax { + inline __device__ T operator()(T a, T b) const { +#if defined(__HIP_PLATFORM_HCC__) + return (static_cast(THCNumerics::sub(a, b)) > 0 || THCNumerics::isnan(a)) ? a : b; +#else + return (THCNumerics::gt(a, b) || THCNumerics::isnan(a)) ? a : b; +#endif + } +}; + +struct LogicalAll { + inline __device__ unsigned char operator()(const unsigned char x, + const unsigned char y) const { + return (x && y); + } +}; + +struct LogicalAny { + inline __device__ unsigned char operator()(const unsigned char x, + const unsigned char y) const { + return (x || y); + } +}; + +template +inline __device__ T THCMax(const T a, const T b) { + return THCNumerics::gt(a, b) ? 
a : b; +} + +template +__global__ void THCTensor_kernel_renorm(T *data, + const AccT value, + const ptrdiff_t size, + const AccT maxnorm) { + __shared__ AccT buffer[32]; + int64_t tx = threadIdx.x; + int64_t bx = blockIdx.x; + int64_t step = blockDim.x; + T *row = data + size * bx; + + buffer[tx] = scalar_cast(0); + AccT norm; + +#if !defined(__HIP_DEVICE_COMPILE__) + if (THCNumerics::eq(value, scalar_cast(INFINITY))) { + // get norm of axis + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + buffer[tx] = THCMax(buffer[tx], THCNumerics::abs(val)); + } + // add (reduce) + for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tx < stride) + buffer[tx] = THCMax(buffer[tx], buffer[tx+stride]); + } + // clip norms + __syncthreads(); + norm = buffer[0]; + } else { + // get norm of axis + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + buffer[tx] = THCNumerics::add( + buffer[tx], + THCNumerics::pow(THCNumerics::abs(val), value) + ); + } + // add (reduce) + for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tx < stride) + buffer[tx] = THCNumerics::add(buffer[tx], buffer[tx+stride]); + } + // clip norms + __syncthreads(); + norm = THCNumerics::pow(buffer[0], THCNumerics::cinv(value)); + } + + if (THCNumerics::gt(norm, maxnorm)) { + norm = THCNumerics::div( + maxnorm, + THCNumerics::add(norm, scalar_cast(1e-7)) + ); + // renormalize + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + row[i] = scalar_cast(THCNumerics::mul(val, norm)); + } + } +#endif +} + +template +struct TensorNonZeroOp { + TensorNonZeroOp() {} + + __host__ __device__ T operator()(const T lhs) const { + const T zero = scalar_cast(0); + if (THCNumerics::eq(lhs, zero)) return zero; + + return scalar_cast(1); + } +}; + +template +struct TensorNormOp { + TensorNormOp(T _exponent) : exponent{_exponent} {} + + __host__ __device__ T operator()(const T x) const { + switch (StaticExp) { + case 1: return THCNumerics::abs(x); + case 2: return THCNumerics::mul(x, x); + default: return THCNumerics::pow(THCNumerics::abs(x), exponent); + } + } + + const T exponent; +}; + +/* + Fuses conversions and a TensorDistOp. Needed for Thrust. +*/ +template +struct ThrustTensorDistOp { + ThrustTensorDistOp(AccT _exponent) : exponent{_exponent} {} + + __host__ __device__ AccT operator()(T _x, T _y) const { + const AccT x = scalar_cast(_x); + const AccT y = scalar_cast(_y); + return THCNumerics::pow( + THCNumerics::abs(THCNumerics::sub(x, y)), + exponent); + } + + const AccT exponent; +}; + +#include + +// Given the sum of values and the sum of squares, compute the variance or standard deviation. +template +__forceinline__ __device__ T THCTensor_computeVar( + T sum, + T sum2, + const unsigned row_size) { + + T rs2 = scalar_cast(row_size); + T rs2m = scalar_cast(row_size - 1); + T zero = scalar_cast(0); + + if (flag) { + sum = THCNumerics::div(sum, rs2); + sum2 = THCNumerics::div(sum2, rs2); + sum2 = THCNumerics::sub(sum2, THCNumerics::mul(sum, sum)); + sum2 = (THCNumerics::lt(sum2, zero) ? zero : sum2); + } else { + sum = THCNumerics::div(sum, rs2); + sum2 = THCNumerics::div(sum2, rs2m); + sum2 = THCNumerics::sub(sum2, + THCNumerics::mul( + THCNumerics::div(rs2 ,rs2m), + THCNumerics::mul(sum, sum))); + sum2 = (THCNumerics::lt(sum2, zero) ? 
zero : sum2); + } + + if (apply_sqrt) + return THCNumerics::sqrt(sum2); + + return sum2; +} + +/* Compute the variance (or standard deviation) along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * - if flag is set, normalize by `row_size` instead of `row_size - 1` + * - if apply_sqrt is set, compute the standard deviation instead of variance + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void THCTensor_kernel_varOuterDim(T *tgt, T *src_, unsigned num_orows, unsigned num_irows, unsigned row_size) { + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + T *src = src_ + orow * row_size * num_irows + irow; + AccT mean = scalar_cast(0); + AccT m2 = scalar_cast(0); + + for (unsigned col = 0; col < row_size; ++col) { + AccT val = scalar_cast(*src); + AccT delta = THCNumerics::sub(val, mean); + mean = THCNumerics::add(mean, + THCNumerics::div(delta, scalar_cast(col + 1))); + AccT delta2 = THCNumerics::sub(val, mean); + m2 = THCNumerics::add(m2, + THCNumerics::mul(delta, delta2)); + src += num_irows; + } + + if (flag) { + m2 = THCNumerics::div(m2, scalar_cast(row_size)); + } else { + m2 = THCNumerics::div(m2, scalar_cast(row_size - 1)); + } + + tgt[orow * num_irows + irow] = scalar_cast( + apply_sqrt ? THCNumerics::sqrt(m2) : m2); + } + } +} + +template +__host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int64_t dimension, int flag) { + unsigned ndim = THCTensor__nDimension(state, src); + // Treat all outer dimensions (i.e. dim < dimension) as one. + unsigned num_orows = 1; + for (int64_t dim = 0; dim < dimension; dim++) { + num_orows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, dimension); + // Treat all inner dimensions (i.e. dim > dimension) as one. + unsigned num_irows = 1; + for (unsigned dim = dimension + 1; dim < ndim; dim++) { + num_irows *= THCTensor_size(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + if (flag) { + THCTensor_kernel_varOuterDim<<>>( + tgt->template data(), src->template data(), num_orows, num_irows, row_size); + } else { + THCTensor_kernel_varOuterDim<<>>( + tgt->template data(), src->template data(), num_orows, num_irows, row_size); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) THError(cudaGetErrorString(errcode)); +} + +/* Compute the variance (or standard deviation) of the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * - if flag is set, normalize by `row_size` instead of `row_size - 1` + * - if apply_sqrt is set, compute the standard deviation instead of variance + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. 
the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + * + * Uses Welford's algorithm for numeric stability. Divides the dataset into parallel groups + * and computes the M2 and mean for each group. (M2 is \sum (x - \bar{x})^2) + * For example, if the data is split into two groups x and y, the overall M2 can + * be computed by: + * + * overall_M2 = M2x + nx * (mean(x) - overall_mean)^2 + * + M2y + ny * (mean(y) - overall_mean)^2 + * + * This implementation assumes that each block has been launched with 16 x 32 threads. + */ +template +__global__ void THCTensor_kernel_varInnermostDim(T *tgt, T *src_, unsigned num_rows, unsigned row_size) { + /* + * Each block computes the var/std of blockDim.y (32) rows at once. + * One can visualize the computation as a 16 (x) by 32 (y) grid. + * - Each of the 32 rows of the block is responsible for the computation + * of one input row. + * - Each row has 16 columns; the variance computation of one input row is + * split between 16 threads. + * - Each of those 16 threads handles the accumulation of 1/16 of the input + * row's data. + */ + for (unsigned block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + + /* + * Compute local mean, local M2 via Welford's algorithm for this thread. + */ + AccT acc_zero = scalar_cast(0); + AccT local_mean = acc_zero; + AccT local_M2 = acc_zero; + unsigned count = 0; + + if (row < num_rows) { + T *src = src_ + row * row_size; + + for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { + ++count; + AccT val = scalar_cast(src[col]); + AccT delta = THCNumerics::sub(val, local_mean); + local_mean = THCNumerics::add( + local_mean, + THCNumerics::div(delta, scalar_cast(count))); + AccT delta2 = THCNumerics::sub(val, local_mean); + local_M2 = THCNumerics::add( + local_M2, + THCNumerics::mul(delta, delta2)); + } + } + + AccT local_sum = + THCNumerics::mul(local_mean, scalar_cast(count)); + + /* + * We are reducing across each row of 16 threads to find the true sum of the + * entire input row. The warp shfl xor loop ultimately gives each thread the + * true sum. + */ + for (unsigned lane_mask = 8; lane_mask > 0; lane_mask >>= 1) { + local_sum = THCNumerics::add(local_sum, + WARP_SHFL_XOR((row < num_rows) ? local_sum : acc_zero, lane_mask, 16)); + } + AccT true_mean = THCNumerics::div(local_sum, + scalar_cast(row_size)); + + /* + * Adjust each local_M2 according to the following: + * adjusted_M2 = local_M2 + mean_diff * mean_diff * count + * The sum of these adjusted M2s is equal to the overall M2. + */ + AccT adjusted_M2 = acc_zero; + if (row < num_rows) { + AccT mean_diff = THCNumerics::sub(true_mean, local_mean); + adjusted_M2 = THCNumerics::add( + local_M2, + THCNumerics::mul( + THCNumerics::mul(mean_diff, mean_diff), + scalar_cast(count))); + } + + /* + * Sums the adjusted M2s. The thread with threadIdx.x == 0 has + * the total sum, which is equal to the M2 for the entire input row. + */ + for (unsigned s = 8; s >= 1; s >>= 1) { + adjusted_M2 = THCNumerics::add(adjusted_M2, + WARP_SHFL_DOWN((row < num_rows) ? 
adjusted_M2 : acc_zero, s, 16)); + } + + if (row < num_rows && threadIdx.x == 0) { + AccT M2 = adjusted_M2; + AccT variance; + if (flag) { + variance = THCNumerics::div(M2, scalar_cast(row_size)); + } else { + variance = THCNumerics::div(M2, scalar_cast(row_size - 1)); + } + tgt[row] = scalar_cast( + apply_sqrt ? THCNumerics::sqrt(variance) : variance); + } + } +} + +template +__host__ void THCTensor_varInnermostDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int flag) { + unsigned ndim = THCTensor__nDimension(state, src); + // Treat all outer dimensions as a single dimension. + unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, ndim - 1); + + // From limited testing, 16x32 seemed a good compromise for handling both long and short dimensions. + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + if (flag) { + THCTensor_kernel_varInnermostDim<<>>( + tgt->template data(), src->template data(), num_rows, row_size); + } else { + THCTensor_kernel_varInnermostDim<<>>( + tgt->template data(), src->template data(), num_rows, row_size); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) THError(cudaGetErrorString(errcode)); +} + + +/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). + These are useful when you not only have to do a reduction, but you might have + to preserve the location of contention (for example min/max operations). + The structure of the kernels follows the structure of the reduction kernels. +*/ +template +__global__ void +kernelTransformReduceOuterDimIndex(K *tgt1, + Index *tgt2, + K *src_, + unsigned num_orows, + unsigned num_irows, + unsigned row_size, + thrust::pair init, + BinaryFunction binary_op) { + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; + irow < num_irows; + irow += gridDim.y * blockDim.x) { + K *src = src_ + orow * row_size * num_irows + irow; + thrust::pair acc = init; + + for (unsigned col = 0; col < row_size; ++col) { + // +1 for Lua index + acc = binary_op(acc, + thrust::make_pair(*src, col + TH_INDEX_BASE)); + src += num_irows; + } + + tgt1[orow * num_irows + irow] = acc.first; + tgt2[orow * num_irows + irow] = acc.second; + } + } +} + +template +__host__ void +THC_transformReduceOuterDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + int64_t rdim, + const thrust::pair& init, + BinaryFunction binary_op) { + unsigned ndim = THCTensor__nDimension(state, src); + unsigned num_orows = 1; + for (int64_t dim = 0; dim < rdim; dim++) { + num_orows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, rdim); + unsigned num_irows = 1; + for (unsigned dim = rdim + 1; dim < ndim; dim++) { + num_irows *= THCTensor_size(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), + min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + kernelTransformReduceOuterDimIndex + <<>>( + tgt1->template data(), + tgt2->template data(), + src->template data(), + num_orows, num_irows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) + * + * For an n-d tensor (n <= 4) where the reduction is along the innermost 
dimension: + * + * - block.x is the innermost dimension, i.e. dimension 0; + * - block.y and grid.y make up dimension 1; and + * - grid.x and grid z are the remaining two outer dimensions (if any) + * + * Reduction along other dimensions is handled in a separate kernel. + */ +template +__global__ void +kernelTransformReduceInnermostDimIndex(K *tgt1, + Index* tgt2, + K *src_, + unsigned num_rows, + unsigned row_size, + thrust::pair init, + BinaryFunction binary_op) { + __shared__ K sbuf[32][16 + 1]; // avoid bank conflict + __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict + + for (unsigned block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + thrust::pair acc = init; + if (row < num_rows) { + K *src = src_ + row * row_size; + // Sequential reduction within a thread. + for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { + acc = binary_op(acc, thrust::make_pair(src[col], col + TH_INDEX_BASE)); + } + } + + sbuf[threadIdx.y][threadIdx.x] = acc.first; + ibuf[threadIdx.y][threadIdx.x] = acc.second; + + __syncthreads(); + + // Reduce intermediate values to single value. + K* sline = &sbuf[threadIdx.y][0]; + Index* iline = &ibuf[threadIdx.y][0]; + for (unsigned s = 8; s > 0; s >>= 1) { + if (row < num_rows && threadIdx.x < s) { + thrust::pair arg1 = + thrust::make_pair(sline[threadIdx.x], iline[threadIdx.x]); + thrust::pair arg2 = + thrust::make_pair(sline[threadIdx.x + s], iline[threadIdx.x + s]); + thrust::pair res = binary_op(arg1, arg2); + + sline[threadIdx.x] = res.first; + iline[threadIdx.x] = res.second; + } + __syncthreads(); + } + + if (row < num_rows && threadIdx.x == 0) { + tgt1[row] = sline[0]; + tgt2[row] = iline[0]; + } + __syncthreads(); + } +} + +template +__host__ void +THC_transformReduceInnermostDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + const thrust::pair& init, + BinaryFunction binary_op) { + unsigned ndim = THCTensor__nDimension(state, src); + unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, ndim - 1); + + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + kernelTransformReduceInnermostDimIndex + <<>>( + tgt1->template data(), + tgt2->template data(), + src->template data(), + num_rows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +void +THC_reduceDimIndex(THCState *state, + TensorTypeK *tgt1_, + TensorTypeIndex *tgt2_, + TensorTypeK *src, + int64_t dimension, + int keepdim, + const thrust::pair& init, + BinaryFunction binary_op) +{ + THArgCheck(dimension >= 0 && + dimension < THCTensor__nDimension(state, src), + 3, "dimension out of range"); + + + // Unsqueeze tgt1_/tgt_2 if necessary so that their contiguity traits + // are preserved if they are the same size as the correct reduction output. 
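+  // For example, reducing a 4 x 5 x 6 tensor along dimension 1 is first materialized
+  // as a 4 x 1 x 6 result (see the resize below); only when keepdim is 0 is that
+  // singleton dimension squeezed away at the end, giving a 4 x 6 result.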
+ int src_dims = THCTensor__nDimension(state, src); + THCTensor_preserveReduceDimSemantics( + state, tgt1_, src_dims, dimension, keepdim); + THCTensor_preserveReduceDimSemantics( + state, tgt2_, src_dims, dimension, keepdim); + + THLongStorage *dim = THCTensor_newSizeOf(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_resize(state, tgt1_, dim, NULL); + THCTensor_resize(state, tgt2_, dim, NULL); + THLongStorage_free(dim); + + TensorTypeK *tgt1 = (TensorTypeK*)THCTensor_newContiguous(state, tgt1_); + TensorTypeIndex *tgt2 = (TensorTypeIndex*)THCTensor_newContiguous(state, tgt2_); + src = (TensorTypeK*)THCTensor_newContiguous(state, src); + + if (dimension == THCTensor__nDimension(state, src) - 1) { + THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); + } else { + THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); + } + + THCTensor_free(state, src); + THCTensor_freeCopyTo(state, tgt1, tgt1_); + THCTensor_freeCopyTo(state, tgt2, tgt2_); + if (!keepdim) { + THCTensor_squeeze1d(state, tgt1_, tgt1_, dimension); + THCTensor_squeeze1d(state, tgt2_, tgt2_, dimension); + } +} + +template +struct MaxValuePair { + __host__ __device__ + thrust::pair operator()(const thrust::pair& a, + const thrust::pair& b) { + return (THCNumerics::ge(a.first, b.first) || + THCNumerics::isnan(a.first)) ? a : b; + } +}; + +template +struct MinValuePair { + __host__ __device__ + thrust::pair operator()(const thrust::pair& a, + const thrust::pair& b) { + return (THCNumerics::le(a.first, b.first) || + THCNumerics::isnan(a.first)) ? a : b; + } +}; + +template +struct AddOp { + __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) { + return THCNumerics::add(lhs, rhs); + } +}; + +template +struct MulOp { + __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) { + return THCNumerics::mul(lhs, rhs); + } +}; + +#endif // THC_TENSORMATH_REDUCE_CUH diff --git a/aten/src/THC/THCTensorMathScan.cu b/aten/src/THC/THCTensorMathScan.cu new file mode 100644 index 0000000..6f01bd2 --- /dev/null +++ b/aten/src/THC/THCTensorMathScan.cu @@ -0,0 +1,129 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMathReduce.cuh" +#include +#include + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. 
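+ *
+ * As a concrete example, an inclusive sum-scan (init = 0, binary_op = add) of the
+ * row [1, 2, 3, 4] produces [1, 3, 6, 10]; with init = 1 and a multiply op it
+ * produces the running products [1, 2, 6, 24]. A serial reference for one inner
+ * row, mirroring the accumulation in the kernel below (strides omitted):
+ *
+ *   T acc = init;
+ *   for (unsigned col = 0; col < row_size; ++col) {
+ *     acc = binary_op(acc, src[col]);
+ *     tgt[col] = acc;
+ *   }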
+ */ +template +__global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_, + unsigned num_orows, unsigned num_irows, unsigned row_size, + T init, BinaryOp binary_op) +{ + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + T *src = src_ + orow * row_size * num_irows + irow; + T *tgt = tgt_ + orow * row_size * num_irows + irow; + T acc = init; + + for (unsigned col = 0; col < row_size; ++col) { + acc = binary_op(acc, *src); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_, + unsigned num_rows, unsigned row_size, + T init, BinaryFunction binary_op) +{ + __shared__ T sbuf[num_threads_y][2 * num_threads_x]; + + T* row_buf = sbuf[threadIdx.y]; + + for (unsigned block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + T block_total = init; + + T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + unsigned col1 = block_col + threadIdx.x; + unsigned col2 = block_col + num_threads_x + threadIdx.x; + if (row < num_rows) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction (up-sweep). + for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) { + if (row < num_rows && threadIdx.x < s) { + unsigned offset = (2 * threadIdx.x + 1) * d - 1; + row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + // Down-sweep. + for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) { + if (row < num_rows && threadIdx.x < s - 1) { + unsigned offset = 2 * (threadIdx.x + 1) * d - 1; + row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + // Write back to output. 
+ if (row < num_rows) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +#include "generic/THCTensorMathScan.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMode.cu b/aten/src/THC/THCTensorMode.cu new file mode 100644 index 0000000..52a5ce2 --- /dev/null +++ b/aten/src/THC/THCTensorMode.cu @@ -0,0 +1,18 @@ +#include "THC.h" +#include "THCThrustAllocator.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "THCTensorMode.cuh" + +#include "generic/THCTensorMode.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh new file mode 100644 index 0000000..0158f25 --- /dev/null +++ b/aten/src/THC/THCTensorMode.cuh @@ -0,0 +1,282 @@ +#ifndef THC_TENSOR_MODE_CUH +#define THC_TENSOR_MODE_CUH + +#include "THCNumerics.cuh" +#include "THCSortUtils.cuh" +#include "THCScanUtils.cuh" + +struct ThrustHalfLess +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::lt(lhs, rhs); + } +}; + +struct ThrustHalfNotEqualTo +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::ne(lhs, rhs); + } +}; + +struct ThrustHalfEqualTo +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::eq(lhs, rhs); + } +}; + +struct ThrustHalfEqualToPredicate +{ + ThrustHalfEqualToPredicate(half val): val_(val) {} + __host__ __device__ inline bool operator()(half x) { + return THCNumerics::eq(val_, x); + } + + half val_; +}; + +template +struct BinaryAddOp { + __host__ __device__ inline T operator()(const T a, const T b) { + return THCNumerics::add(a, b); + } +}; + +template <> +struct BinaryAddOp { + __host__ __device__ inline unsigned int operator()(const unsigned int a, const unsigned int b) { + return a + b; + } +}; + +// Used for a segmented reduction +struct ModeUnsignedBoolPair { + unsigned int val; + bool flag; +}; + +// In the kernel below, we have a common pattern of reducing (unsigned int, unsigned int) +// pairs of data +struct ModeUnsignedPair { + unsigned int val; + unsigned int index; +}; + +template +struct MaxReduceOp { + __host__ __device__ inline T operator()(const T& a, const T& b) { + return b.val > a.val ? b : a; + } +}; + +template +struct MatchReduceOp { + __host__ __device__ inline T operator()(const T& a, const T& b) { + return b.flag ? b : a; + } +}; + +// The mode kernel has the following characteristics: It uses internal shared memory +// buffers of Power2Size, which must be greater than the number of elements. Additionally, +// there is one block for every slice to calculate the mode for, and in each block there +// is one thread for every two elements. +// +// Both sorted and positions are assumed to be contiguous Tensors with the mode dimension +// as the innermost dim, such that we can get the particular slice for a Tensor via its +// linear block dimension * the slice size. 
+template +__global__ void computeMode( + T *input, + TensorInfo values, + TensorInfo indices, + int64_t sliceSize) +{ + int tidx = threadIdx.x; + int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for + + // First, we need to calculate the offset into the sorted Tensor that represents + // the start of the slice for this block to calculate the mode for. This offset + // is a combination of the gridIndices, and the number of elements in the slice. + unsigned int blockId = getLinearBlockId(); + unsigned int linearOffset = blockId * sliceSize; + + // shmem is a dynamically sized buffer we will use throughout the kernel to + // handle computation efficiently. The size of this shmem must be + // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size) + // + // Initially, the buffer will be organized as follows: + // + // [smem (slice elements) | bmem (valid indices) | ] + extern __shared__ char shmem[]; + + // smem represents a proportion of the shared memory buffer that is used to store + // the elements from the slice: + T *smem = reinterpret_cast(shmem); + + // Each thread loads up to two elements from the Tensor into shared memory + if (tidx < sliceSize) { + smem[tidx] = input[linearOffset + tidx]; + } + if (stidx < sliceSize) { + smem[stidx] = input[linearOffset + stidx]; + } + + // Next, we initialize a boolean region of the buffer, offset by the loaded element + // smem region + bool *bmem = reinterpret_cast(&smem[Power2Size]); + + // The first use of this region stores bmem[i] = i < sliceSize to mark the valid + // components in the smem buffer + bmem[tidx] = tidx < sliceSize; + bmem[stidx] = stidx < sliceSize; + __syncthreads(); // barrier for smem, bmem initialization + + // First, sort the input slice in ascending order. smem contains the input + // elements, and bmem marks the valid indices + bitonicSortKeys, T, unsigned int, Power2Size>(smem, bmem, LTComp()); + __syncthreads(); // make no assumptions that the sort syncs at end + + // The next step of our algorithm is performing a block-wide comparison of + // neighboring elements. In particular, given an sorted input slice A, we + // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise 0. + // + // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8] + // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // + // In particular, we can think of B[i] true indicating the start of a sequence of + // equal values in the sorted list. Similarly, we will also store the negation of B, + // which we'll call C. In particular, we can think of C[i] = true iff A[i-1] == A[i] + // in our original sorted slice. + // + // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + + // We overwrite bmem, and treat the rest of shared memory as a buffer of (index, flag) pairs + // where the index represents values from C, and the flag represents values from B. + // + // [smem (sorted slice) | ubpmem (index, flag pairs)] + + struct ModeUnsignedBoolPair *ubpmem = reinterpret_cast( + &smem[Power2Size]); + + if (tidx == 0) { + ubpmem[0].flag = true; + ubpmem[0].val = 0; + } + + // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ... + ubpmem[tidx * 2 + 1].flag = THCNumerics::ne(smem[tidx * 2], smem[tidx * 2 + 1]); // (0, 1), (1, 2), etc. + ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag; + + // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ... 
+ if (((tidx + 1) * 2) < Power2Size) { + ubpmem[(tidx + 1) * 2].flag = THCNumerics::ne(smem[((tidx + 1) * 2) - 1], smem[(tidx + 1) * 2]); + ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag; + } + __syncthreads(); // barrier for ubpmem initialization + + // Next, we perform a segmented prefix sum on the neighboring elements, where + // the presence of a one indicates the start of a segment. In this case B acts + // as the segment start flags, and C is the buffer to be summed: + // + // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0] + // + // Afterwards, the (index) components of the ubpmem buffer contain the lengths of the + // segments (minus 1), i.e. the counts of each element in the original input. + + inclusivePrefixScan< + struct ModeUnsignedBoolPair, + struct SegmentedScanOp >, + Power2Size>( + ubpmem, + SegmentedScanOp >(BinaryAddOp())); + // assumes scan syncs at the end + + // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. we treat the + // boolean flag regions as integers). We initialize these to represent indices, and we'll call + // this buffer I + struct ModeUnsignedPair *uupmem = reinterpret_cast(ubpmem); + + // At this point, we need to find the maximum element in lengths buffer C. + // This element will represent the count (-1) of the mode. Because of the + // way we have set up the problem, the index where this mode occurs will + // also be the location of the mode value in the sorted array, e.g. + // + // smem = [0, 0, 1, 1, 1, 2] + // C = [0, 1, 0, 1, 2, 0] + // I = [0, 1, 2, 3, 4, 5] + // ^ + // maximum value, also aligned with mode = 1 + // + // We perform a block wide max-reduction of the C buffer, but we also need the + // indices to come along with it, so we utilize the uupmem construction. + // + // At the end we need to return the ModeUnsignedPair containing index = 4, val = 2, + // which represents the max + + // In practice, we will make each thread locally reduce 2 values in its registers prior + // to the global block-wide reduction. Note that instead of tidx/stidx, we utilize tidx * 2, + // tidx * 2 + 1, so each thread deals with adjacent elements. This is because the reduce + // code below relies on thread elements to be adjacent. + struct ModeUnsignedPair uup[2]; + uup[0].index = tidx * 2; + uup[0].val = ubpmem[tidx * 2].val; + uup[1].index = tidx * 2 + 1; + uup[1].val = ubpmem[tidx * 2 + 1].val; + __syncthreads(); + + struct ModeUnsignedPair max = {0, 0}; + + max = reduceBlockWithNThreadLocalReductions, 2> + (uupmem, uup, sliceSize, MaxReduceOp(), max); + + // Store the mode in shared memory for use in finding the mode in the input slice + __shared__ T mode; + + // Given the above constraints, the mode is the value at the reduced index in the + // original sorted element buffer + if (tidx == 0) { + mode = smem[max.index]; + } + __syncthreads(); // broadcast mode + + // Finally, we need to find the "an" index of the mode in the input Tensor. The API does + // not constrain which index we pick, so it can be any of the indices that contain the mode. + // We will do a reduction to find the index. We go back to using the (index, flag) buffer + // arrangement. 
First, we mark indices that are equal to the mode, i.e B[i] = true if + // input[i] == mode, and initialize C[i] to be the index + // + // Again we reduce 2 elements in the thread's registers prior to the block-wide reduction + struct ModeUnsignedBoolPair ubpp[2]; + if (tidx * 2 < sliceSize) { + ubpp[0].flag = THCNumerics::eq(input[linearOffset + (tidx * 2)], mode); + ubpp[0].val = tidx * 2; + } + if (tidx * 2 + 1 < sliceSize) { + ubpp[1].flag = THCNumerics::eq(input[linearOffset + (tidx * 2 + 1)], mode); + ubpp[1].val = tidx * 2 + 1; + } + + // Then we perform a similar reduction to the one above, except this time we update + // the element if the element at the base position is not equal to the mode and + // the element at the offset position is. At the end, C[0] will contain an index + // with the mode. + struct ModeUnsignedBoolPair match = {0, false}; + + match = reduceBlockWithNThreadLocalReductions, 2> + (ubpmem, ubpp, sliceSize, MatchReduceOp(), match); + + // Finally, we have the mode, and an index where it occurs. We use a single thread + // to place this in the appropriate output position + if (tidx == 0) { + int64_t index = TH_INDEX_BASE + match.val; + + unsigned int outputOffset = IndexToOffset::get(blockId, values); + values.data[outputOffset] = mode; + indices.data[outputOffset] = index; + } +} + +#endif // THC_TENSOR_MODE_CUH diff --git a/aten/src/THC/THCTensorRandom.cpp b/aten/src/THC/THCTensorRandom.cpp new file mode 100644 index 0000000..e7a4100 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cpp @@ -0,0 +1,141 @@ +#include "THCTensorRandom.h" +#include "THCGenerator.hpp" + +#include +#include + + +void initializeGenerator(THCState *state, THCGenerator* gen); +void createGeneratorState(THCGenerator* gen, uint64_t seed); + + +/* Frees memory allocated during setup. 
*/ +void destroyGenerator(THCState *state, THCGenerator* gen) +{ + std::lock_guard lock(gen->mutex); + if (gen->state.gen_states) + { + THCudaFree(state, gen->state.gen_states); + gen->state.gen_states = NULL; + } + if (gen->state.kernel_params) + { + THCudaFree(state, gen->state.kernel_params); + gen->state.kernel_params = NULL; + } +} + +static uint64_t createSeed(std::random_device& rd) +{ + // limit to 53 bits to ensure unique representation in double + uint64_t seed = (((uint64_t)rd()) << 32) + rd(); + return seed & 0x1FFFFFFFFFFFFF; +} + +/* Initialize generator array (must be called before any other function) */ +void THCRandom_init(THCState* state, int devices, int current_device) +{ + THCRNGState* rng_state = THCState_getRngState(state); + rng_state->num_devices = devices; + rng_state->gen = (THCGenerator*)malloc(rng_state->num_devices * sizeof(THCGenerator)); + std::random_device rd; + for (int i = 0; i < rng_state->num_devices; ++i) + { + new (&rng_state->gen[i].mutex) std::mutex(); + rng_state->gen[i].state.initf = 0; + rng_state->gen[i].state.initial_seed = createSeed(rd); + rng_state->gen[i].state.philox_seed_offset = 0; + rng_state->gen[i].state.gen_states = NULL; + rng_state->gen[i].state.kernel_params = NULL; + } +} + +/* Destroy generators and free memory */ +void THCRandom_shutdown(THCState* state) +{ + THCRNGState* rng_state = THCState_getRngState(state); + if (rng_state->gen == NULL) return; + for (int i = 0; i < rng_state->num_devices; ++i) + { + destroyGenerator(state, &rng_state->gen[i]); + } + free(rng_state->gen); + rng_state->gen = NULL; +} + +/* Get the generator for the current device, but does not initialize the state */ +static THCGenerator* THCRandom_rawGenerator(THCState* state) +{ + THCRNGState* rng_state = THCState_getRngState(state); + int device; + THCudaCheck(cudaGetDevice(&device)); + if (device >= rng_state->num_devices) THError("Invalid device index."); + return &rng_state->gen[device]; +} + +/* Get the generator for the current device and initializes it if necessary */ +THCGenerator* THCRandom_getGenerator(THCState* state) +{ + THCGenerator* gen = THCRandom_rawGenerator(state); + std::lock_guard lock(gen->mutex); + if (gen->state.initf == 0) + { + initializeGenerator(state, gen); + createGeneratorState(gen, gen->state.initial_seed); + gen->state.initf = 1; + } + return gen; +} + +struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + return gen->state.gen_states; +} + +/* Random seed */ +uint64_t THCRandom_seed(THCState* state) +{ + std::random_device rd; + uint64_t s = createSeed(rd); + THCRandom_manualSeed(state, s); + return s; +} + +uint64_t THCRandom_seedAll(THCState* state) +{ + std::random_device rd; + uint64_t s = createSeed(rd); + THCRandom_manualSeedAll(state, s); + return s; +} + +/* Manually set the seed */ +void THCRandom_manualSeed(THCState* state, uint64_t seed) +{ + THCGenerator* gen = THCRandom_rawGenerator(state); + std::lock_guard lock(gen->mutex); + gen->state.initial_seed = seed; + if (gen->state.initf) { + createGeneratorState(gen, seed); + } +} + +void THCRandom_manualSeedAll(THCState* state, uint64_t seed) +{ + THCRNGState* rng_state = THCState_getRngState(state); + int currentDevice; + THCudaCheck(cudaGetDevice(¤tDevice)); + for (int i = 0; i < rng_state->num_devices; ++i) { + THCudaCheck(cudaSetDevice(i)); + THCRandom_manualSeed(state, seed); + } + THCudaCheck(cudaSetDevice(currentDevice)); +} + +/* Get the initial seed */ +uint64_t 
THCRandom_initialSeed(THCState* state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + return gen->state.initial_seed; +} diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu new file mode 100644 index 0000000..6544a18 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cu @@ -0,0 +1,196 @@ +#include "THCTensorRandom.h" +#include "THCDeviceUtils.cuh" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCTensorMath.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorRandom.cuh" +#include "THCGenerator.hpp" + +#include +#include +#include +#include +#include + +#define MAX_NUM_BLOCKS 200 +#define BLOCK_SIZE 256 + + +THCGenerator* THCRandom_getGenerator(THCState* state); + +/* Sets up generator. Allocates but does not create the generator states. Not thread-safe. */ +__host__ void initializeGenerator(THCState *state, THCGenerator* gen) +{ + gen->state.gen_states = static_cast(THCudaMalloc(state, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32))); + gen->state.kernel_params = static_cast(THCudaMalloc(state, sizeof(mtgp32_kernel_params))); +} + +/* Creates a new generator state given the seed. Not thread-safe. */ +__host__ void createGeneratorState(THCGenerator* gen, uint64_t seed) +{ + if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS) + { + THError("Creating MTGP constants failed."); + } + if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213, + gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS) + { + THError("Creating MTGP kernel state failed."); + } +} + +__host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + std::lock_guard lock(gen->mutex); + + // The RNG state comprises the MTPG32 states, the seed, and an offset used for Philox + static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + static const size_t seed_size = sizeof(gen->state.initial_seed); + static const size_t offset_size = sizeof(gen->state.philox_seed_offset); + static const size_t total_size = states_size + seed_size + offset_size; + THByteTensor_resize1d(rng_state, total_size); + THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); + THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); + THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states, + states_size, cudaMemcpyDeviceToHost)); + memcpy(THByteTensor_data(rng_state) + states_size, &gen->state.initial_seed, seed_size); + memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &gen->state.philox_seed_offset, offset_size); +} + +__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel) +{ + state[threadIdx.x].k = kernel; +} + +__host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + std::lock_guard lock(gen->mutex); + + static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + static const size_t seed_size = sizeof(gen->state.initial_seed); + static const size_t offset_size = sizeof(gen->state.philox_seed_offset); + static const size_t total_size = states_size + seed_size + offset_size; + bool no_philox_seed = false; + if (THByteTensor_nElement(rng_state) == total_size - offset_size) { + no_philox_seed = true; + } + else { + THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, 
"RNG state is wrong size"); + } + THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); + + THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state), + states_size, cudaMemcpyHostToDevice)); + set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>( + gen->state.gen_states, gen->state.kernel_params); + memcpy(&gen->state.initial_seed, THByteTensor_data(rng_state) + states_size, seed_size); + if (!no_philox_seed) { + memcpy(&gen->state.philox_seed_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); + } + else { + gen->state.philox_seed_offset = 0; + } +} + +// Goes from (0, 1] to [0, 1). Note 1-x is not sufficient since for some floats +// eps near 0, 1-eps will round to 1. +template +__device__ inline T reverse_bounds(T value) { + if (THCNumerics::eq(value, ScalarConvert::to(1))) { + return ScalarConvert::to(0); + } + return value; +} + + +#ifdef CUDA_HALF_TENSOR +__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { + half width = ScalarConvert::to(b - a); + half start = ScalarConvert::to(a); + half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); +} +#endif + +#define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ +{ \ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \ + CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \ + if (i < size) { \ + T y = TRANSFORM; \ + result[i] = y; \ + } \ + } \ +} + +#define GENERATE_KERNEL2(NAME, T, ARG1, ARG2, CURAND_T, CURAND_FUNC, TRANSFORM) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \ +{ \ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \ + CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \ + if (i < size) { \ + T y = TRANSFORM; \ + result[i] = y; \ + } \ + } \ +} + +template +struct is_same { static const bool value = false; }; + +template +struct is_same { static const bool value = true; }; + +template +__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size, + real *result, prob_type *probs) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + if (is_same::value) { + double x = curand_uniform_double(&state[blockIdx.x]); + if (i < size) + result[i] = ScalarConvert::to(x <= probs[i]); + } else { + float x = curand_uniform(&state[blockIdx.x]); + if (i < size) + result[i] = ScalarConvert::to(x <= probs[i]); + } + } +} + +// NOTE: curand_uniform is (0, 1] and we want [a, b) +GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) + +GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) +GENERATE_KERNEL2(generate_normal, double, double mean, 
double stdv, double, curand_normal_double, (x * stdv) + mean) + +GENERATE_KERNEL1(generate_exponential, float, double lambda, float, curand_uniform, (float)(-1. / lambda * log(x))) +GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uniform_double, (double)(-1. / lambda * log(x))) + +GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) +GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) + +#ifdef CUDA_HALF_TENSOR +GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +#endif // CUDA_HALF_TENSOR + +#include "generic/THCTensorRandom.cu" +#include "THCGenerateAllTypes.h" + +#undef GENERATE_KERNEL1 +#undef GENERATE_KERNEL2 diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh new file mode 100644 index 0000000..7749f23 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cuh @@ -0,0 +1,401 @@ +#ifndef THC_TENSOR_RANDOM_CUH +#define THC_TENSOR_RANDOM_CUH + +#include "THCNumerics.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorMathReduce.cuh" + +#include + +#define MAX_NUM_BLOCKS 200 +#define BLOCK_SIZE 256 +/* Separate kernel because curand_log_normal gets extra parameters. 
*/ + +template +__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + float x = curand_log_normal(&state[blockIdx.x], mean, stddev); + if (i < size) { + result[i] = ScalarConvert::to(x); + } + } +} + +template <> +__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev); + if (i < size) { + result[i] = x; + } + } +} + +template +__global__ void +multinomialAliasDrawKernel(int size, int64_t *output, int64_t *J, T *q, int64_t K, T *uniform, T *bernoulli){ + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + if (idx < size) { + int64_t rand_ind = ScalarConvert::to(uniform[idx]); + T bern_uniform = bernoulli[idx]; + int _mask = (int) THCNumerics::lt(bern_uniform, q[rand_ind]); + output[idx] = J[rand_ind]*(1 -_mask) + (rand_ind+1L) * _mask; + } +} + +template +__global__ void +aliasMultinomialFilter(T *q, T *probs, int64_t *smaller, int64_t *larger, int64_t *J_data, int64_t *larger_short_data, int64_t *smaller_short_data, T one, int64_t inputsize){ + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + if (idx < inputsize) { + larger_short_data[idx] = 0; + smaller_short_data[idx] = 0; + J_data[idx]= 0; + T val = THCNumerics::mul(probs[idx], ScalarConvert::to(inputsize)); + if (THCNumerics::lt(val, one)) { + smaller[idx] = idx+1; + larger[idx] = 0; + } else { + larger[idx] = idx+1; + smaller[idx] = 0; + } + q[idx] = val; + } +} + +template +__global__ void +condDiv(T *q, int64_t *J, int64_t inputsize, T q_max) { + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + T one = ScalarConvert::to(1); + if (idx < inputsize) { + if (J[idx] <= 0) { + q[idx] = one; + } else { + if (THCNumerics::gt(q_max, one)) { + q[idx] = THCNumerics::div(q[idx], q_max); + } + } + } +} + + +#undef MAX_NUM_BLOCKS +#undef BLOCK_SIZE + +// Normalizes the L1 norm of every row to 1; used by multinomial +template +__global__ void renormRowsL1(T* dist, long rows, long cols) { + extern __shared__ unsigned char my_smem[]; + T *smem = reinterpret_cast(my_smem); + T zero = ScalarConvert::to(0); + T val; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + T sum = ScalarConvert::to(0); + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + val = dist[row * cols + col]; + assert(THCNumerics::ge(val, zero)); + sum = THCNumerics::add(sum, val); + } + + sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd(), zero); + if (threadIdx.x == 0) { + assert(THCNumerics::gt(sum, zero)); + smem[0] = sum; + } + __syncthreads(); + + sum = smem[0]; + if (THCNumerics::gt(sum, ScalarConvert::to(0))) { + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + dist[row * cols + col] = THCNumerics::div(dist[row * cols + col], sum); + } + } + } +} + +template +__device__ int binarySearchForMultinomial(T* dist, + int size, + T val) { + int start = 0; + int end = size; + + while (end - start > 0) { + int mid = start + (end - start) / 2; + + T midVal = dist[mid]; + if (THCNumerics::lt(midVal, val)) { + start = mid + 1; + } else { + end = mid; + } + } + + 
if (start == size) { + // No probability mass or precision problems; just return the + // first non-zero element by setting start to size-1 here, + // the code below will move it to the last non-zero probability + // this actually can happen when the random number is 1 + // (github pytorch issue #4858). + start = size - 1; + } + + T curVal = dist[start]; + while(start >= 1 && THCNumerics::eq(dist[start - 1], curVal)) start--; + + return start; +} + +template +__global__ void +sampleMultinomialOnce(int64_t* dest, + int64_t distributions, + int categories, + T* sampled, + T* dist, + int stride_dist, // dist->stride[0] + int stride_categories // dist->stride[1] + ) { + extern __shared__ unsigned char my_smem[]; + __shared__ bool found; + + // Shared Memory hold blockdim.x T for holding the cumulative sum, + // blockDim.x AccT for normalizing the probabilities, + T *smem = reinterpret_cast(my_smem); + AccT *asmem = reinterpret_cast(&my_smem[blockDim.x * sizeof(T)]); + + AccT accZero = ScalarConvert::to(0); + T zero = ScalarConvert::to(0); + + for (int64_t curDist = blockIdx.x; + curDist < distributions; curDist += gridDim.x) { + // Each block handles one distribution + // First pass, find the total sum of the distribution + AccT sum = accZero; + T val; + for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { + val = dist[curDist * stride_dist + cat * stride_categories]; + assert(THCNumerics::ge(val, zero)); + assert(!THCNumerics::isinf(val)); + assert(!THCNumerics::isnan(val)); + sum = THCNumerics::add(sum, ScalarConvert::to(val)); + } + + // threadIdx.x == 0 has the sum value from this + sum = reduceBlock(asmem, blockDim.x, sum, ReduceAdd(), accZero); + + // Broadcast sum and sample value + if (threadIdx.x == 0) { + // Make sure the sum of our distribution didn't overflow + assert(!isinf(sum)); + assert(THCNumerics::gt(sum, accZero)); + + asmem[0] = sum; + smem[0] = sampled[curDist]; + } + __syncthreads(); + + sum = asmem[0]; + T sample = smem[0]; + __syncthreads(); + + if (THCNumerics::eq(sum, accZero) || THCNumerics::eq(sample, zero)) { + // Choose the first element + if (threadIdx.x == 0) { + dest[curDist] = TH_INDEX_BASE; + } + + continue; + } + + int chunks = THCCeilDiv(categories, (int) blockDim.x); + T prevHighProb = zero; + found = false; + + for (int chunk = 0; chunk < chunks && !found; ++chunk) { + // All threads in bounds load a value + int cat = chunk * blockDim.x + threadIdx.x; + + AccT val = + cat < categories ? + THCNumerics::div( + ScalarConvert::to(dist[curDist * stride_dist + cat * stride_categories]), + sum) : + accZero; + + smem[threadIdx.x] = ScalarConvert::to(val); + __syncthreads(); + + // Perform an inclusive prefix sum of the shared memory contents + for (int offset = 1; offset < blockDim.x; offset *= 2) { + T val = zero; + + if (threadIdx.x >= offset) { + val = THCNumerics::add(smem[threadIdx.x - offset], smem[threadIdx.x]); + } + + __syncthreads(); + if (threadIdx.x >= offset) { + smem[threadIdx.x] = val; + } + __syncthreads(); + } + + // Each thread will check to see if the sample falls in its + // bucket + T curBucket = THCNumerics::add(smem[threadIdx.x], prevHighProb); + T prevBucket = + threadIdx.x == 0 ? 
prevHighProb : + THCNumerics::add(smem[threadIdx.x - 1], prevHighProb); + bool inBucket = + (cat < categories) && + (!THCNumerics::gt(sample, curBucket)) && + (THCNumerics::gt(sample, prevBucket)); + + if (inBucket) { + // We're done; we have the sample + // Torch indices are 1-based + dest[curDist] = cat + TH_INDEX_BASE; + found = true; + } + + // Store the previous scan's high value for future use + prevHighProb = THCNumerics::add(prevHighProb, smem[blockDim.x - 1]); + + __syncthreads(); + } + + if (threadIdx.x == 0 && !found) { + // This should address a rare bug where we don't select a valid index. This likely occurs when + // due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but + // and our uniform sample is greater than this value. In this case we likely have unitialized memory + // in dest[curDist]. So basically we will loop through the distribution and pick the largest index + // where the distribution is non-zero. This is obviously terribly inefficient, but due to the + // rarity in which this occurs, this should not be an issue. + for (int cat = categories - 1; cat >= 0; --cat) { + if (THCNumerics::gt(dist[curDist * stride_dist + cat * stride_categories], zero)) { + dest[curDist] = cat + TH_INDEX_BASE; + break; + } + } + } + } +} + +template +__global__ void +sampleMultinomialWithReplacement(curandStateMtgp32* state, + int totalSamples, + int64_t* dest, + int64_t distributions, + int categories, + T* normDistPrefixSum) { + // At the moment, each warp computes one sample value in the binary + // search due to divergence. It seems possible to compute multiple + // values and limit divergence though later on. However, no matter + // what, all block threads must participate in the curand_uniform + // call to update the generator state. + + // The block determines the distribution for which we generate a point + for (int64_t curDist = blockIdx.x; + curDist < distributions; + curDist += gridDim.x) { + for (int sampleBase = 0; + sampleBase < totalSamples; sampleBase += blockDim.y) { + // The warp determines the sample + int sample = sampleBase + threadIdx.y; + + // All threads participate in this + T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x])); + + if (threadIdx.x == 0 && sample < totalSamples) { + // Find the bucket that a uniform sample lies in + int choice = binarySearchForMultinomial( + normDistPrefixSum + curDist * categories, + categories, + r); + + // Torch indices are 1-based + dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE; + } + } + } +} + +template +__global__ void +sampleMultinomialWithoutReplacement(curandStateMtgp32* state, + int totalSamples, + int sample, + int64_t* dest, + int64_t distributions, + int categories, + T* origDist, + T* normDistPrefixSum) { + // At the moment, each warp computes one sample value in the binary + // search due to divergence. It seems possible to compute multiple + // values and limit divergence though later on. However, no matter + // what, all block threads must participate in the curand_uniform + // call to update the generator state. 
+ + // The block and warp determines the distribution for which we + // generate a point + for (int64_t curDistBase = blockIdx.x * blockDim.y; + curDistBase < distributions; + curDistBase += gridDim.x * blockDim.y) { + // The warp determines the distribution + int64_t curDist = curDistBase + threadIdx.y; + + // All threads must participate in this + T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x])); + + if (threadIdx.x == 0 && curDist < distributions) { + // Find the bucket that a uniform sample lies in + int choice = binarySearchForMultinomial( + normDistPrefixSum + curDist * categories, + categories, + r); + + // Torch indices are 1-based + dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE; + + // Without replacement, so update the original probability so it + // is not considered a second time + origDist[curDist * categories + choice] = ScalarConvert::to(0); + } + } +} + +template +__global__ void +aliasMultinomialSetup(int64_t *J, T*q, int64_t inputsize, int64_t * smaller, int64_t *larger, int small_c, int large_c) { + T one = ScalarConvert::to(1); + // Loop through and create little binary mixtures that + // appropriately allocate the larger outcomes over the + // overall uniform mixture. + int64_t large = 0; + int64_t small = 0; + while (small_c > 0 && large_c > 0) { + large = larger[large_c-1]-1; + small = smaller[small_c-1]-1; + J[small] = large; + T q_sub = THCNumerics::sub(one, q[small]); + q[large] = THCNumerics::sub(q[large], q_sub); + if (THCNumerics::le(q[large], one)) { + smaller[small_c-1] = large+1; + large_c -= 1; + } else { + larger[large_c-1] = large+1; + small_c -= 1; + } + } +} + +#endif // THC_TENSOR_RANDOM_CUH diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h new file mode 100644 index 0000000..5203df2 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.h @@ -0,0 +1,31 @@ +#ifndef TH_CUDA_TENSOR_RANDOM_INC +#define TH_CUDA_TENSOR_RANDOM_INC + +#include "THCTensor.h" + +#include "generic/THCTensorRandom.h" +#include "THCGenerateAllTypes.h" + +typedef struct THCGenerator THCGenerator; + +typedef struct THCRNGState { + /* One generator per GPU */ + THCGenerator* gen; + int num_devices; +} THCRNGState; + +struct THCState; + +THC_API void THCRandom_init(struct THCState *state, int num_devices, int current_device); +THC_API void THCRandom_shutdown(struct THCState *state); +THC_API uint64_t THCRandom_seed(struct THCState *state); +THC_API uint64_t THCRandom_seedAll(struct THCState *state); +THC_API void THCRandom_manualSeed(struct THCState *state, uint64_t the_seed_); +THC_API void THCRandom_manualSeedAll(struct THCState *state, uint64_t the_seed_); +THC_API uint64_t THCRandom_initialSeed(struct THCState *state); +THC_API void THCRandom_getRNGState(struct THCState *state, THByteTensor *rng_state); +THC_API void THCRandom_setRNGState(struct THCState *state, THByteTensor *rng_state); + +THC_API struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state); + +#endif diff --git a/aten/src/THC/THCTensorScatterGather.cu b/aten/src/THC/THCTensorScatterGather.cu new file mode 100644 index 0000000..a1ed0d4 --- /dev/null +++ b/aten/src/THC/THCTensorScatterGather.cu @@ -0,0 +1,184 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCAtomics.cuh" +#include "THCApply.cuh" + +// Compute the offsets into the given tensors for a linear index. For the 't2' +// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same +// size (with the exception of 't2' in dimension 'dim'). 
+// This version uses a static number of dimensions. +template +struct IndexToScatterGatherOffsets { + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t1, IndexType* t1Offset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = Dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + *t1Offset += curDimIndex * t1.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } + + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = Dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } +}; + +// Same as above but using a dynamic number of dimensions. +template +struct IndexToScatterGatherOffsets { + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t1, IndexType* t1Offset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = index.dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + *t1Offset += curDimIndex * t1.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } + + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = index.dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } +}; + +template +__global__ void THCudaTensor_gatherKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + tensor, &tensorOffset, + src, &srcOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < src.sizes[dim]); + srcOffset += indexValue * src.strides[dim]; + + tensor.data[tensorOffset] = src.data[srcOffset]; + } +} + +template +__global__ void THCudaTensor_scatterKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + src, &srcOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + tensor.data[tensorOffset] = 
src.data[srcOffset]; + } +} + +template +__global__ void THCudaTensor_scatterAddKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + src, &srcOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + atomicAdd(&tensor.data[tensorOffset], src.data[srcOffset]); + } +} + +template +__global__ void THCudaTensor_scatterFillKernel( + TensorInfo tensor, + TensorInfo index, + Real value, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + tensor.data[tensorOffset] = value; + } +} + +#include "generic/THCTensorScatterGather.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorSort.cu b/aten/src/THC/THCTensorSort.cu new file mode 100644 index 0000000..ed1342f --- /dev/null +++ b/aten/src/THC/THCTensorSort.cu @@ -0,0 +1,62 @@ +#include "THCTensorSort.cuh" + +void THCudaLongTensor_fillSliceWithIndex(THCState* state, + THCudaLongTensor* t, + int dim) { + int64_t dims = THCudaLongTensor__nDimension(state, t); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + ptrdiff_t inElements = THCudaLongTensor_nElement(state, t); + int64_t sliceSize = THCudaLongTensor_size(state, t, dim); + ptrdiff_t numSlices = inElements / sliceSize; + + dim3 grid; + if (!THC_getGridFromTiles(numSlices, grid)) { + THError("Slice to fill with indices is too large"); + } + + int64_t maxThreads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + int64_t numThreads = sliceSize; + if (numThreads > maxThreads) { + numThreads = maxThreads; + } + + dim3 block(numThreads); + +#define FILL_INDEX(T, DIM) \ + fillSliceWithIndex \ + <<>>( \ + info, numSlices, sliceSize, info.strides[collapseDim]) + + if (THCTensor_canUse32BitIndexMath(state, t)) { + TensorInfo info = + getTensorInfo(state, t); + info.reduceDim(dim); + int collapseDim = info.collapseDims(dim); + + if (info.isContiguous()) { + FILL_INDEX(unsigned int, -2); + } else { + if (info.dims == 1) { + FILL_INDEX(unsigned int, 1); + } else if (info.dims == 2) { + FILL_INDEX(unsigned int, 2); + } else { + FILL_INDEX(unsigned int, -1); + } + } + } else { + TensorInfo info = + getTensorInfo(state, t); + info.reduceDim(dim); + int collapseDim = info.collapseDims(dim); + + // catch-all implementation + FILL_INDEX(uint64_t, -1); + } + +#undef FILL_INDEX + + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh new file mode 100644 index 0000000..9b75a73 --- /dev/null +++ b/aten/src/THC/THCTensorSort.cuh @@ -0,0 +1,86 @@ +#ifndef THC_TENSORSORT_CUH +#define THC_TENSORSORT_CUH + +#include 
"THCReduceApplyUtils.cuh" +#include "THCSortUtils.cuh" +#include "THCTensorCopy.h" +#include "THCTensorTypeUtils.cuh" + +#include "THCThrustAllocator.cuh" +#include +#include +#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__) +#include +#endif + +template +struct ThrustGTOp { + __device__ bool operator()(const T& lhs, const T& rhs) const { + return THCNumerics::gt(lhs, rhs); + } +}; + +template +struct ThrustLTOp { + __device__ bool operator()(const T& lhs, const T& rhs) const { + return THCNumerics::lt(lhs, rhs); + } +}; + +// `base` is the base address of a tensor +// For each slice (defined as a linear point of `out`, from 0 -> +// (sliceSize - 1) * sliceStride, we fill that slice from `0` to +// `sliceSize - 1`. +template +__global__ void +fillSliceWithIndex(TensorInfo out, + IndexType totalSlices, + IndexType sliceSize, + IndexType sliceStride) { + IndexType slice = getLinearBlockId(); + + if (slice >= totalSlices) { + return; + } + + const uint64_t offset = + IndexToOffset::get(slice, out); + int64_t* base = &out.data[offset]; + + for (int64_t i = threadIdx.x; i < sliceSize; i += blockDim.x) { + // Torch indices are 1-based (hence the +1) + base[i * sliceStride] = i + TH_INDEX_BASE; + } +} + +// For slice sorting in Thrust; extracts a slice index from a linear +// index and uses that for comparison +struct SliceComp { + SliceComp(int64_t size) : sliceSize(size) {} + + __device__ bool operator()(const int64_t& a, const int64_t& b) const { + // Since the slices are guaranteed to be innermost, + // the segment is just via int64_t division + int64_t segA = a / sliceSize; + int64_t segB = b / sliceSize; + return segA < segB; + } + + const int64_t sliceSize; +}; + +// For sorting in Thurst; extracts a within-slice index from a linear index +struct GlobalIndexToPerSliceIndex { + GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {} + + __device__ inline void operator()(int64_t& v) const { + v = v % sliceSize + TH_INDEX_BASE; + } + + const int64_t sliceSize; +}; + +void THCudaLongTensor_fillSliceWithIndex(THCState* state, + THCudaLongTensor* t, + int dim); +#endif // THC_TENSORSORT_CUH diff --git a/aten/src/THC/THCTensorTopK.cu b/aten/src/THC/THCTensorTopK.cu new file mode 100644 index 0000000..325d560 --- /dev/null +++ b/aten/src/THC/THCTensorTopK.cu @@ -0,0 +1,19 @@ +#include "THC.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorCopy.h" +#include "THCTensorMath.h" +#include "THCAsmUtils.cuh" +#include "THCScanUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCTensorMathReduce.cuh" +#include // for std::min + +#if CUDA_VERSION >= 7000 +#include +#endif + +#include "THCTensorTopK.cuh" + +#include "generic/THCTensorTopK.cu" +#include "THCGenerateAllTypes.h" + diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh new file mode 100644 index 0000000..c243316 --- /dev/null +++ b/aten/src/THC/THCTensorTopK.cuh @@ -0,0 +1,485 @@ +#ifndef THC_TENSOR_TOPK_CUH +#define THC_TENSOR_TOPK_CUH + +template +struct TopKTypeConfig {}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + // Converts a float to an integer representation with the same + // sorting; i.e., for floats f1, f2: + // if f1 < f2 then convert(f1) < convert(f2) + // We use this to enable radix selection of floating-point values. + // This also gives a relative order for NaNs, but that's ok, as they + // will all be adjacent + static inline __device__ RadixType convert(float v) { + RadixType x = __float_as_int(v); + RadixType mask = (x & 0x80000000) ? 
0xffffffff : 0x80000000; + + return (x ^ mask); + } + + static inline __device__ float deconvert(RadixType v) { + RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff; + + return __int_as_float(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(uint8_t v) { + return v; + } + + static inline __device__ uint8_t deconvert(RadixType v) { + return v; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int8_t v) { + return 128u + v; + } + + static inline __device__ int8_t deconvert(RadixType v) { + return v - 128; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int16_t v) { + assert(sizeof(short) == 2); + return 32768u + v; + } + + static inline __device__ int16_t deconvert(RadixType v) { + return v - 32768; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int32_t v) { + assert(sizeof(int) == 4); + return 2147483648u + v; + } + + static inline __device__ int32_t deconvert(RadixType v) { + return v - 2147483648u; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(int64_t v) { + assert(sizeof(int64_t) == 8); + return 9223372036854775808ull + v; + } + + static inline __device__ int64_t deconvert(RadixType v) { + return v - 9223372036854775808ull; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(double v) { + RadixType x = __double_as_longlong(v); + RadixType mask = -((x >> 63)) | 0x8000000000000000; + return (x ^ mask); + } + + static inline __device__ double deconvert(RadixType v) { + RadixType mask = ((v >> 63) - 1) | 0x8000000000000000; + return __longlong_as_double(v ^ mask); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(half v) { +#if CUDA_VERSION >= 8000 + RadixType x = __half_as_ushort(v); + RadixType mask = -((x >> 15)) | 0x8000; + return (x ^ mask); +#else + assert(false); + return 0u; +#endif + } + + static inline __device__ half deconvert(RadixType v) { +#if CUDA_VERSION >= 8000 + RadixType mask = ((v >> 15) - 1) | 0x8000; + return __ushort_as_half(v ^ mask); +#else + assert(false); + return ScalarConvert::to(0); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +// This function counts the distribution of all input values in a +// slice we are selecting by radix digit at `radixDigitPos`, but only +// those that pass the filter `((v & desiredMask) == desired)`. +// This produces and broadcasts the seen counts for a single block only. +// `smem` must have at least `RadixSize` elements. +template +__device__ void countRadixUsingMask(CountType counts[RadixSize], + CountType* smem, + BitDataType desired, + BitDataType desiredMask, + int radixDigitPos, + IndexType sliceSize, + IndexType withinSliceStride, + DataType* data) { + // Clear out per-thread counts from a previous round +#pragma unroll + for (int i = 0; i < RadixSize; ++i) { + counts[i] = 0; + } + + if (threadIdx.x < RadixSize) { + smem[threadIdx.x] = 0; + } + __syncthreads(); + + // Scan over all the data. Upon a read, the warp will accumulate + // counts per each digit in the radix using warp voting. 
+ for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) { + BitDataType val = TopKTypeConfig::convert(doLdg(&data[i * withinSliceStride])); + + bool hasVal = ((val & desiredMask) == desired); + BitDataType digitInRadix = Bitfield::getBitfield(val, radixDigitPos, RadixBits); + +#pragma unroll + for (unsigned int j = 0; j < RadixSize; ++j) { + bool vote = hasVal && (digitInRadix == j); + counts[j] += __popc(WARP_BALLOT(vote, ACTIVE_MASK())); + } + } + + // Now, for each warp, sum values + if (getLaneId() == 0) { +#pragma unroll + for (unsigned int i = 0; i < RadixSize; ++i) { + atomicAdd(&smem[i], counts[i]); + } + } + + __syncthreads(); + + // For each thread, read in the total counts +#pragma unroll + for (unsigned int i = 0; i < RadixSize; ++i) { + counts[i] = smem[i]; + } + + __syncthreads(); +} + +// Over what radix we are selecting values +#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS) +#define RADIX_SIZE 4 // 2 ^ RADIX_BITS +#define RADIX_MASK (RADIX_SIZE - 1) + +// This finds the unique value `v` that matches the pattern +// ((v & desired) == desiredMask) in our sorted int format +template +__device__ DataType findPattern(DataType* smem, + DataType* data, + IndexType sliceSize, + IndexType withinSliceStride, + BitDataType desired, + BitDataType desiredMask) { + if (threadIdx.x < 32) { + smem[threadIdx.x] = ScalarConvert::to(0); + } + __syncthreads(); + + // All threads participate in the loop, in order to sync on the flag + IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x); + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < sliceSize); + DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : ScalarConvert::to(0); + + if (inRange && ((TopKTypeConfig::convert(v) & desiredMask) == desired)) { + // There should not be conflicts if we are using findPattern, + // since the result is unique + smem[0] = ScalarConvert::to(1); + smem[1] = v; // can't use val as the flag, since it could be 0 + } + + __syncthreads(); + + DataType found = smem[0]; + DataType val = smem[1]; + + __syncthreads(); + + // Check to see if a thread found the value + if (THCNumerics::ne(found, ScalarConvert::to(0))) { + // all threads return this value + return val; + } + } + + // should not get here + assert(false); + return ScalarConvert::to(0); +} + +// Returns the top-Kth element found in the data using radix selection +template +__device__ void radixSelect(DataType* data, + IndexType k, + IndexType sliceSize, + IndexType withinSliceStride, + int* smem, + DataType* topK) { + // Per-thread buckets into which we accumulate digit counts in our + // radix + int counts[RADIX_SIZE]; + + // We only consider elements x such that (x & desiredMask) == desired + // Initially, we consider all elements of the array, so the above + // statement is true regardless of input. 
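// Worked illustration (added here for exposition; not in the original code):
// selecting the 2nd-largest of {0101, 0001, 0111, 0011} with RADIX_BITS = 2.
// First pass, over the high digit: the counts are {00: 2, 01: 2, 10: 0, 11: 0}.
// Scanning digits downward, 01 is the first digit whose count (2) >= kToFind
// (2), so the high digit is locked to 01 and kToFind stays 2. Second pass,
// over the low digit and now filtered to {0101, 0111}: digit 11 has count 1 <
// kToFind, so kToFind drops to 1; digit 01 then has count 1 == kToFind == 1,
// which pins down the unique pattern 0101, and findPattern retrieves that
// element from the data.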
+ BitDataType desired = 0; + BitDataType desiredMask = 0; + + // We are looking for the top kToFind-th element when iterating over + // digits; this count gets reduced by elimination when counting + // successive digits + int kToFind = k; + + // We start at the most significant digit in our radix, scanning + // through to the least significant digit +#pragma unroll + for (int digitPos = sizeof(DataType) * 8 - RADIX_BITS; + digitPos >= 0; + digitPos -= RADIX_BITS) { + + // Count radix distribution for the current position and reduce + // across all threads + countRadixUsingMask( + counts, smem, + desired, desiredMask, digitPos, + sliceSize, withinSliceStride, data); + + // All threads participate in the comparisons below to know the + // final result + + +#define CHECK_RADIX(i) \ + int count = counts[i]; \ + \ + /* All threads have the same value in counts here, so all */ \ + /* threads will return from the function. */ \ + if (count == 1 && kToFind == 1) { \ + /* There is a unique answer. */ \ + desired = Bitfield::setBitfield(desired, i, digitPos, RADIX_BITS); \ + desiredMask = \ + Bitfield::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \ + \ + /* The answer is now the unique element v such that: */ \ + /* (v & desiredMask) == desired */ \ + /* However, we do not yet know what the actual element is. We */ \ + /* need to perform a search through the data to find the */ \ + /* element that matches this pattern. */ \ + *topK = findPattern( \ + (DataType*) smem, data, sliceSize, \ + withinSliceStride, desired, desiredMask); \ + return; \ + } \ + \ + if (count >= kToFind) { \ + desired = Bitfield::setBitfield(desired, i, digitPos, RADIX_BITS); \ + desiredMask = \ + Bitfield::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \ + \ + /* The top-Kth element v must now be one such that: */ \ + /* (v & desiredMask == desired) */ \ + /* but we haven't narrowed it down; we must check the next */ \ + /* least-significant digit */ \ + break; \ + } \ + \ + kToFind -= count \ + + if (Order) { + // Process in descending order +#pragma unroll + for (int i = RADIX_SIZE - 1; i >= 0; --i) { + CHECK_RADIX(i); + } + } else { + // Process in ascending order +#pragma unroll + for (int i = 0; i < RADIX_SIZE; ++i) { + CHECK_RADIX(i); + } + } +#undef CHECK_RADIX + } // end digitPos for + + // There is no unique result, but there is a non-unique result + // matching `desired` exactly + *topK = TopKTypeConfig::deconvert(desired); +} + +template +__global__ void gatherTopK(TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + + IndexType numInputSlices, + IndexType inputWithinSliceStride, + + TensorInfo topK, + IndexType numTopKSlices, + IndexType topKWithinSliceStride, + + TensorInfo indices, + IndexType indicesWithinSliceStride) { + // Indices are limited to integer fp precision, so counts can fit in + // int32, regardless of IndexType + __shared__ int smem[32]; // one per each warp, up to warp limit + + IndexType slice = getLinearBlockId(); + if (slice >= numInputSlices) { + return; + } + + // Find the start offset for our slice + IndexType sliceStartIndex = + IndexToOffset::get(slice, input); + IndexType topKSliceStartIndex = + IndexToOffset::get(slice, topK); + IndexType indicesSliceStartIndex = + IndexToOffset::get(slice, indices); + + T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our 
input + T topKValue = ScalarConvert::to(0); + radixSelect::RadixType, IndexType, Order>( + inputSliceStart, outputSliceSize, + inputSliceSize, inputWithinSliceStride, + smem, &topKValue); + + // Every value that is strictly less/greater than `pattern` + // (depending on sort dir) in sorted int format is in the top-K. + // The top-K value itself might not be unique. + // + // Since there are a variable number of elements that we see that + // are within the top-k, we don't know at what index to write out + // the resulting values. + // In order to get this, we perform an exclusive prefix sum of + // `hasTopK`. This will return the resulting index into which we + // need to write the result, if a thread has a result. + + // All threads need to participate in the loop and the prefix sum, + // but not necessarily in the load; hence loop bounds being rounded + // up to a multiple of the block dim. + IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x); + IndexType writeIndexStart = 0; + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert::to(0); + bool hasTopK; + if (Order) { + hasTopK = inRange && (THCNumerics::gt(v, topKValue)); + } else { + hasTopK = inRange && (THCNumerics::lt(v, topKValue)); + } + + int index; + int carry; + exclusiveBinaryPrefixScan(smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK) { + int writeIndex = writeIndexStart + index; + assert(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index + } + + writeIndexStart += carry; + } + + // We need to fill in the rest with actual == top-K values. + // The number that we need is outputSliceSize - + // writeIndexStart. There might be more than that number available, + // in which case we have to choose the first seen set. We do this + // via a prefix sum to calculate indices for writing results. + assert(outputSliceSize >= writeIndexStart); + IndexType topKRemaining = (outputSliceSize - writeIndexStart); + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? 
doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert::to(0); + bool hasTopK = inRange && (THCNumerics::eq(v, topKValue)); + + int index; + int carry; + exclusiveBinaryPrefixScan(smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK && index < topKRemaining) { + int writeIndex = writeIndexStart + index; + assert(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index + } + + if (carry >= topKRemaining) { + break; + } + + topKRemaining -= carry; + writeIndexStart += carry; + } +} + +#undef RADIX_BITS +#undef RADIX_SIZE +#undef RADIX_MASK + +#endif // THC_TENSOR_TOPK_CUH diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh new file mode 100644 index 0000000..6ff6d68 --- /dev/null +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -0,0 +1,142 @@ +#ifndef THC_TENSOR_TYPE_UTILS_INC +#define THC_TENSOR_TYPE_UTILS_INC + +#include +#include +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensor.hpp" +#include "THCTensorInfo.cuh" +#include "THCTensor.hpp" + +/// A utility for accessing THCuda*Tensor types in a generic manner + +/// Equivalent to C++11's type_traits std::is_same; used for comparing +/// equality of types. Don't assume the existence of C++11 +template +struct SameType { + static const bool same = false; +}; + +template +struct SameType { + static const bool same = true; +}; + +template +bool isSameType() { + return SameType::same; +} + +// Utility function for constructing TensorInfo structs. In this case, the +// two template parameters are: +// +// 1. The TensorType, e.g. THCTensor in generic functions, or THCudaTensor, +// THCudaLongTensor etc. +// +// 2. The IndexType. This is always going to be an unsigned integral value, +// but depending on the size of the Tensor you may select uint16_t +// uint32_t, uint64_t etc. +// +// Internally we use the TensorUtils static functions to get the necessary +// dims, sizes, stride etc. +// +// For example, suppose we have a THCudaTensor t, with dim = 2, size = [3, 4], +// stride = [4, 1], offset = 8, and we set our index type to be unsigned int. +// Then we yield a TensorInfo struct templatized with float, unsigned int and +// the following fields: +// +// data is a float* to the underlying storage at position 8 +// dims is 2 +// sizes is a MAX_CUTORCH_DIMS element array with [3, 4] in its first two positions +// strides is a MAX_CUTORCH_DIMS element array with [4, 1] in its first two positions +// +// TensorInfos can then be passed to CUDA kernels, but we can use the static functions +// defined above to perform Tensor Operations that are appropriate for each +// TensorType. 
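// ---------------------------------------------------------------------------
// Editorial sketch (not part of this patch): the index arithmetic implied by
// the example above is the same modulo/divide walk used by
// IndexToScatterGatherOffsets earlier in this diff. With sizes = [3, 4] and
// strides = [4, 1], a host-side version and two hypothetical calls:
#include <cstdio>

unsigned int linearIdToOffset(unsigned int linearId,
                              const unsigned int* sizes,
                              const unsigned int* strides,
                              int dims) {
  unsigned int offset = 0;
  for (int d = dims - 1; d >= 0; --d) {
    unsigned int curDimIndex = linearId % sizes[d];  // coordinate in dim d
    offset += curDimIndex * strides[d];              // weight it by the stride
    linearId /= sizes[d];
  }
  return offset;
}

int main() {
  unsigned int sizes[2] = {3, 4};
  unsigned int contiguous[2] = {4, 1};  // the example layout in the comment above
  unsigned int transposed[2] = {1, 3};  // a hypothetical non-contiguous view
  // Element 7 is (row 1, col 3): 1*4 + 3*1 = 7 -- for a contiguous layout the
  // offset equals the linear index, which is why a contiguous fast path can
  // skip this loop entirely.
  std::printf("%u\n", linearIdToOffset(7, sizes, contiguous, 2));  // prints 7
  // The same element through the transposed strides: 1*1 + 3*3 = 10.
  std::printf("%u\n", linearIdToOffset(7, sizes, transposed, 2));  // prints 10
  return 0;
}
// ---------------------------------------------------------------------------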
+template +TensorInfo +getTensorInfo(THCState* state, TensorType* t) { + IndexType sz[MAX_CUTORCH_DIMS]; + IndexType st[MAX_CUTORCH_DIMS]; + + int dims = THCTensor__nDimension(state, t); + for (int i = 0; i < dims; ++i) { + sz[i] = THCTensor_size(state, t, i); + st[i] = THCTensor_stride(state, t, i); + } + + return TensorInfo( + t->template data(), dims, sz, st); +} + +template +struct ScalarNegate { + static __host__ __device__ T to(const T v) { return -v; } +}; + +template +struct ScalarInv { + static __host__ __device__ T to(const T v) { return ((T) 1) / v; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct ScalarNegate { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif +#else +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + half out = v; +#else + __half_raw out = __half_raw(v); +#endif + out.x ^= 0x8000; // toggle sign bit + return out; +#endif + } +}; + +template <> +struct ScalarInv { + static __host__ __device__ half to(const half v) { +#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) + return __float2half(1.0f / __half2float(v)); +#else + float fv = THC_half2float(v); + fv = 1.0f / fv; + return THC_float2half(fv); +#endif + } +}; + +inline bool operator==(half a, half b) { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + return a.x == b.x; +#else + __half_raw araw, braw; + araw = __half_raw(a); + braw = __half_raw(b); + return araw.x == braw.x; +#endif +} + +inline bool operator!=(half a, half b) { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + return a.x != b.x; +#else + __half_raw araw, braw; + araw = __half_raw(a); + braw = __half_raw(b); + return araw.x != braw.x; +#endif +} + +#endif // CUDA_HALF_TENSOR + +#endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/THCThreadLocal.cpp b/aten/src/THC/THCThreadLocal.cpp new file mode 100644 index 0000000..3cc95c3 --- /dev/null +++ b/aten/src/THC/THCThreadLocal.cpp @@ -0,0 +1,46 @@ +#include "THCThreadLocal.h" +#include "THCGeneral.h" +#ifdef _WIN32 +#include +#endif + + +THCThreadLocal THCThreadLocal_alloc(void) +{ +#ifndef _WIN32 + pthread_key_t key; + THAssert(pthread_key_create(&key, NULL) == 0); + return key; +#else + DWORD key = TlsAlloc(); + THAssert(key != TLS_OUT_OF_INDEXES); + return key; +#endif +} + +void THCThreadLocal_free(THCThreadLocal local) +{ +#ifndef _WIN32 + THAssert(pthread_key_delete(local) == 0); +#else + THAssert(TlsFree(local)); +#endif +} + +void* THCThreadLocal_get(THCThreadLocal local) +{ +#ifndef _WIN32 + return pthread_getspecific(local); +#else + return TlsGetValue(local); +#endif +} + +void THCThreadLocal_set(THCThreadLocal local, void* value) +{ +#ifndef _WIN32 + THAssert(pthread_setspecific(local, value) == 0); +#else + THAssert(TlsSetValue(local, value)); +#endif +} diff --git a/aten/src/THC/THCThreadLocal.h b/aten/src/THC/THCThreadLocal.h new file mode 100644 index 0000000..a733cac --- /dev/null +++ b/aten/src/THC/THCThreadLocal.h @@ -0,0 +1,17 @@ +#ifndef THC_THREAD_LOCAL_INC +#define THC_THREAD_LOCAL_INC + +#ifdef _WIN32 +#include +typedef DWORD THCThreadLocal; +#else +#include +typedef pthread_key_t THCThreadLocal; +#endif + +THCThreadLocal THCThreadLocal_alloc(void); +void THCThreadLocal_free(THCThreadLocal local); +void* THCThreadLocal_get(THCThreadLocal local); +void THCThreadLocal_set(THCThreadLocal local, void* value); + +#endif // THC_THREAD_LOCAL_INC diff --git a/aten/src/THC/THCThrustAllocator.cuh 
b/aten/src/THC/THCThrustAllocator.cuh new file mode 100644 index 0000000..0e75322 --- /dev/null +++ b/aten/src/THC/THCThrustAllocator.cuh @@ -0,0 +1,31 @@ +#ifndef THC_THRUST_ALLOCATOR_INC +#define THC_THRUST_ALLOCATOR_INC + +#include + +/// Allocator for Thrust to re-route its internal device allocations +/// to the THC allocator +class THCThrustAllocator { + public: + typedef char value_type; + + THCThrustAllocator(THCState* state) + : state_(state) { + } + + ~THCThrustAllocator() { + } + + char* allocate(std::ptrdiff_t size) { + return static_cast(THCudaMalloc(state_, size)); + } + + void deallocate(char* p, size_t size) { + THCudaFree(state_, p); + } + + private: + THCState* state_; +}; + +#endif // THC_THRUST_ALLOCATOR_INC diff --git a/aten/src/THC/generated/THCTensorMaskedByte.cu b/aten/src/THC/generated/THCTensorMaskedByte.cu new file mode 100644 index 0000000..08818af --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedChar.cu b/aten/src/THC/generated/THCTensorMaskedChar.cu new file mode 100644 index 0000000..27ac787 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedDouble.cu b/aten/src/THC/generated/THCTensorMaskedDouble.cu new file mode 100644 index 0000000..03e6b8e --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedFloat.cu b/aten/src/THC/generated/THCTensorMaskedFloat.cu new file mode 100644 index 0000000..bc4d9a0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedHalf.cu b/aten/src/THC/generated/THCTensorMaskedHalf.cu new file mode 100644 index 0000000..fc544cd --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedInt.cu b/aten/src/THC/generated/THCTensorMaskedInt.cu new file mode 100644 index 0000000..9714761 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedLong.cu b/aten/src/THC/generated/THCTensorMaskedLong.cu new file mode 100644 index 0000000..355ea2b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateLongType.h" diff --git 
a/aten/src/THC/generated/THCTensorMaskedShort.cu b/aten/src/THC/generated/THCTensorMaskedShort.cu new file mode 100644 index 0000000..43fe037 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareByte.cu b/aten/src/THC/generated/THCTensorMathCompareByte.cu new file mode 100644 index 0000000..3eaf375 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareChar.cu b/aten/src/THC/generated/THCTensorMathCompareChar.cu new file mode 100644 index 0000000..471cf03 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareDouble.cu b/aten/src/THC/generated/THCTensorMathCompareDouble.cu new file mode 100644 index 0000000..7bbf36c --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareFloat.cu b/aten/src/THC/generated/THCTensorMathCompareFloat.cu new file mode 100644 index 0000000..5fc04be --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareHalf.cu b/aten/src/THC/generated/THCTensorMathCompareHalf.cu new file mode 100644 index 0000000..52d43ed --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareInt.cu b/aten/src/THC/generated/THCTensorMathCompareInt.cu new file mode 100644 index 0000000..81c056c --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareLong.cu b/aten/src/THC/generated/THCTensorMathCompareLong.cu new file mode 100644 index 0000000..a9ca765 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareShort.cu b/aten/src/THC/generated/THCTensorMathCompareShort.cu new file mode 100644 index 0000000..f620f52 --- /dev/null +++ 
b/aten/src/THC/generated/THCTensorMathCompareShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTByte.cu b/aten/src/THC/generated/THCTensorMathCompareTByte.cu new file mode 100644 index 0000000..0a46202 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTChar.cu b/aten/src/THC/generated/THCTensorMathCompareTChar.cu new file mode 100644 index 0000000..df0c4bb --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTDouble.cu b/aten/src/THC/generated/THCTensorMathCompareTDouble.cu new file mode 100644 index 0000000..6b9f4e7 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTFloat.cu b/aten/src/THC/generated/THCTensorMathCompareTFloat.cu new file mode 100644 index 0000000..b34a12b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTHalf.cu b/aten/src/THC/generated/THCTensorMathCompareTHalf.cu new file mode 100644 index 0000000..b38dc55 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTInt.cu b/aten/src/THC/generated/THCTensorMathCompareTInt.cu new file mode 100644 index 0000000..6a8a114 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTLong.cu b/aten/src/THC/generated/THCTensorMathCompareTLong.cu new file mode 100644 index 0000000..d5bf322 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTShort.cu b/aten/src/THC/generated/THCTensorMathCompareTShort.cu new file mode 100644 index 0000000..d41dab6 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include 
"THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseByte.cu b/aten/src/THC/generated/THCTensorMathPointwiseByte.cu new file mode 100644 index 0000000..b6fe10e --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseChar.cu b/aten/src/THC/generated/THCTensorMathPointwiseChar.cu new file mode 100644 index 0000000..af851f3 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu b/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu new file mode 100644 index 0000000..8053408 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu b/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu new file mode 100644 index 0000000..8149c27 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu b/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu new file mode 100644 index 0000000..29cbf26 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseInt.cu b/aten/src/THC/generated/THCTensorMathPointwiseInt.cu new file mode 100644 index 0000000..7e7c486 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseLong.cu b/aten/src/THC/generated/THCTensorMathPointwiseLong.cu new file mode 100644 index 0000000..583b271 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseShort.cu b/aten/src/THC/generated/THCTensorMathPointwiseShort.cu new file mode 100644 index 0000000..8c30a6b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include 
"../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceByte.cu b/aten/src/THC/generated/THCTensorMathReduceByte.cu new file mode 100644 index 0000000..27490e5 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceChar.cu b/aten/src/THC/generated/THCTensorMathReduceChar.cu new file mode 100644 index 0000000..9e55b7d --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceDouble.cu b/aten/src/THC/generated/THCTensorMathReduceDouble.cu new file mode 100644 index 0000000..5cd6b11 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceFloat.cu b/aten/src/THC/generated/THCTensorMathReduceFloat.cu new file mode 100644 index 0000000..7c21ce2 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceHalf.cu b/aten/src/THC/generated/THCTensorMathReduceHalf.cu new file mode 100644 index 0000000..f05f2d8 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceInt.cu b/aten/src/THC/generated/THCTensorMathReduceInt.cu new file mode 100644 index 0000000..f6fc959 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceLong.cu b/aten/src/THC/generated/THCTensorMathReduceLong.cu new file mode 100644 index 0000000..cff6374 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceShort.cu b/aten/src/THC/generated/THCTensorMathReduceShort.cu new file mode 100644 index 0000000..1ad31a8 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorSortByte.cu b/aten/src/THC/generated/THCTensorSortByte.cu new file mode 100644 index 
0000000..53923a2 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorSortChar.cu b/aten/src/THC/generated/THCTensorSortChar.cu new file mode 100644 index 0000000..0e95c69 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorSortDouble.cu b/aten/src/THC/generated/THCTensorSortDouble.cu new file mode 100644 index 0000000..770ffa0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorSortFloat.cu b/aten/src/THC/generated/THCTensorSortFloat.cu new file mode 100644 index 0000000..e7604b9 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorSortHalf.cu b/aten/src/THC/generated/THCTensorSortHalf.cu new file mode 100644 index 0000000..c783ff0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorSortInt.cu b/aten/src/THC/generated/THCTensorSortInt.cu new file mode 100644 index 0000000..1597eab --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorSortLong.cu b/aten/src/THC/generated/THCTensorSortLong.cu new file mode 100644 index 0000000..787a942 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorSortShort.cu b/aten/src/THC/generated/THCTensorSortShort.cu new file mode 100644 index 0000000..8a0c275 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp new file mode 100644 index 0000000..98b4c3b --- /dev/null +++ b/aten/src/THC/generic/THCStorage.cpp @@ -0,0 +1,123 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorage.cpp" +#else + +real* THCStorage_(data)(THCState *state, const THCStorage *self) +{ + return self->data(); +} + +ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage *self) +{ + return THStorage_size(self); +} + +int THCStorage_(elementSize)(THCState *state) +{ + return sizeof(real); +} + 
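// [Editor's note] The generated THCTensor*<Type>.cu files above and the THCStorage_(...)
// names in this generic file come from TH's "generic file" pattern: one implementation,
// parameterized by `real`/`Real`, is re-included once per scalar type, and TH_CONCAT_*
// macros paste the type name into every symbol. The snippet below is a simplified,
// self-contained imitation of that mechanism; the macro and function names here are
// illustrative, not the real THCGenerate*Type.h machinery.

#include <cstdio>

// One "generic" definition, stamped out per element type. The real code spreads this
// across generic/*.cpp plus THCGenerate*Type.h headers; here the whole expansion is
// collapsed into a single macro for illustration.
#define DEFINE_STORAGE_ELEMENT_SIZE(real, Real)   \
  int Storage##Real##_elementSize(void) {         \
    return (int)sizeof(real);                     \
  }

DEFINE_STORAGE_ELEMENT_SIZE(unsigned char, Byte)   // expands to StorageByte_elementSize
DEFINE_STORAGE_ELEMENT_SIZE(float, Float)          // expands to StorageFloat_elementSize
DEFINE_STORAGE_ELEMENT_SIZE(double, Double)        // expands to StorageDouble_elementSize

int main() {
  std::printf("Byte=%d Float=%d Double=%d\n",
              StorageByte_elementSize(),
              StorageFloat_elementSize(),
              StorageDouble_elementSize());
  return 0;
}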
+void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) +{ + THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) +{ + THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); + real value; + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), + cudaMemcpyDeviceToHost, stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + return value; +} + +THCStorage* THCStorage_(new)(THCState *state) +{ + return THCStorage_new(state, at::CTypeToScalarType::to()); +} + +THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) +{ + return THCStorage_newWithSize(state, at::CTypeToScalarType::to(), size); +} + +THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, + at::Allocator* allocator) +{ + return THCStorage_newWithAllocator(state, at::CTypeToScalarType::to(), + size, allocator); +} + +THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 1); + THCStorage_(set)(state, self, 0, data0); + return self; +} + +THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 2); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + return self; +} + +THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 3); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + THCStorage_(set)(state, self, 2, data2); + return self; +} + +THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 4); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + THCStorage_(set)(state, self, 2, data2); + THCStorage_(set)(state, self, 3, data3); + return self; +} + +THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, ptrdiff_t size, int isShared) +{ + THError("not available yet for THCStorage"); + return NULL; +} + +THCStorage* THCStorage_(newWithDataAndAllocator)( + THCState *state, at::DataPtr&& data, ptrdiff_t size, + at::Allocator *allocator) { + return THCStorage_newWithDataAndAllocator(state, at::CTypeToScalarType::to(), std::move(data), size, allocator); +} + +void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag) +{ + THStorage_setFlag(storage, flag); +} + +void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag) +{ + THStorage_clearFlag(storage, flag); +} + +void THCStorage_(retain)(THCState *state, THCStorage *self) +{ + THStorage_retain(self); +} + +void THCStorage_(free)(THCState *state, THCStorage *self) +{ + THCStorage_free(state, self); +} +#endif diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu new file mode 100644 index 0000000..c3f25f4 --- /dev/null +++ b/aten/src/THC/generic/THCStorage.cu @@ -0,0 +1,25 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE 
"generic/THCStorage.cu" +#else + +void THCStorage_(fill)(THCState *state, THCStorage *self, real value) +{ + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr self_data(THCStorage_(data)(state, self)); + thrust::fill( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + self_data, self_data+self->size, value); +} + +void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THCStorage_resize(state, self, size); +} + +THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { + return THCStorage_getDevice(state, storage); +} + +#endif diff --git a/aten/src/THC/generic/THCStorage.h b/aten/src/THC/generic/THCStorage.h new file mode 100644 index 0000000..4ac2fcd --- /dev/null +++ b/aten/src/THC/generic/THCStorage.h @@ -0,0 +1,58 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorage.h" +#else + +#define TH_STORAGE_REFCOUNTED 1 +#define TH_STORAGE_RESIZABLE 2 + +#define THCStorage THStorage + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THCStorage type. +#define THCudaStorage THCStorage +#define THCudaDoubleStorage THCStorage +#ifdef CUDA_HALF_TENSOR +#define THCudaHalfStorage THCStorage +#endif +#define THCudaByteStorage THCStorage +#define THCudaCharStorage THCStorage +#define THCudaShortStorage THCStorage +#define THCudaIntStorage THCStorage +#define THCudaLongStorage THCStorage + +THC_API real* THCStorage_(data)(THCState *state, const THCStorage*); +THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*); +THC_API int THCStorage_(elementSize)(THCState *state); + +/* slow access -- checks everything */ +THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, real); +THC_API real THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t); + +THC_API THCStorage* THCStorage_(new)(THCState *state); +THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size); +THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real); +THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real); +THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real); +THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real); +THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared); + +#ifdef __cplusplus +THC_API THCStorage* THCStorage_(newWithAllocator)( + THCState *state, ptrdiff_t size, + at::Allocator* allocator); +THC_API THCStorage* THCStorage_(newWithDataAndAllocator)( + THCState *state, at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); +#endif + +THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag); +THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag); +THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage); + +THC_API void THCStorage_(free)(THCState *state, THCStorage *storage); +THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value); + +THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage); + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp new file mode 100644 index 0000000..dc877b6 --- /dev/null +++ 
b/aten/src/THC/generic/THCStorageCopy.cpp @@ -0,0 +1,72 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.cpp" +#else + +void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) +{ + THArgCheck(self->size == src->size, 2, "size does not match"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), + THStorage_(data)(src), + self->size * sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ +void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ +{ \ + THCTensor* selfTensor = \ + THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1); \ + struct TH##TYPEC##Tensor* srcTensor = \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size, 1); \ + THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ + TH##TYPEC##Tensor_free(srcTensor); \ + THCTensor_(free)(state, selfTensor); \ +} +TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Char) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Short) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Int) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Long) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Float) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Half) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) + +void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) +{ + THArgCheck(self->size == src->size, 2, "size does not match"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), + THCStorage_(data)(state, src), + self->size * sizeof(real), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ +void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ +{ \ + TH##TYPEC##Tensor* selfTensor = \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size, 1); \ + struct THCTensor* srcTensor = \ + THCTensor_(newWithStorage1d)(state, src, 0, src->size, 1); \ + TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ + THCTensor_(free)(state, srcTensor); \ + TH##TYPEC##Tensor_free(selfTensor); \ +} +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double) + +#undef TH_CUDA_STORAGE_IMPLEMENT_COPY +#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu new file mode 100644 index 0000000..ba50004 --- /dev/null +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -0,0 +1,46 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.cu" +#else + +void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) +{ + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); +} + +// conversions are delegated to THCTensor implementation +#define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ +void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ +{ \ + THArgCheck(self->size == src->size, 2, "size does 
not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1); \ + struct THCuda##TYPECUDA##Tensor* srcTensor = \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size, 1); \ + THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ + THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ + THCTensor_(free)(state, selfTensor); \ +} + +THC_CUDA_STORAGE_IMPLEMENT_COPY(Byte,Byte) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Char,Char) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Short,Short) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Int,Int) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float +THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) +#ifdef CUDA_HALF_TENSOR +THC_CUDA_STORAGE_IMPLEMENT_COPY(Half,Half) +#endif + +#undef THC_CUDA_STORAGE_IMPLEMENT_COPY + +void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src) +{ + THCStorage_(TH_CONCAT_2(copyCuda, Real))(state, self, src); +} + +void THCStorage_(copy)(THCState *state, THCStorage *self, THCStorage *src) +{ + THCStorage_(copyCuda)(state, self, src); +} + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.h b/aten/src/THC/generic/THCStorageCopy.h new file mode 100644 index 0000000..7a4ef6b --- /dev/null +++ b/aten/src/THC/generic/THCStorageCopy.h @@ -0,0 +1,42 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.h" +#else + +/* Support for copy between different Storage types */ + +THC_API void THCStorage_(rawCopy)(THCState *state, THCStorage *storage, real *src); +THC_API void THCStorage_(copy)(THCState *state, THCStorage *storage, THCStorage *src); +THC_API void THCStorage_(copyByte)(THCState *state, THCStorage *storage, struct THByteStorage *src); +THC_API void THCStorage_(copyChar)(THCState *state, THCStorage *storage, struct THCharStorage *src); +THC_API void THCStorage_(copyShort)(THCState *state, THCStorage *storage, struct THShortStorage *src); +THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct THIntStorage *src); +THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src); +THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src); +THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src); +THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src); + +THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src); +THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src); +THC_API void THCStorage_(copyCudaShort)(THCState *state, THCStorage *storage, struct THCudaShortStorage *src); +THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, struct THCudaIntStorage *src); +THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); +THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); +THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); +#endif + +THC_API void TH_CONCAT_2(THByteStorage_copyCuda , Real)(THCState *state, THByteStorage *self, struct THCStorage *src); +THC_API void 
TH_CONCAT_2(THCharStorage_copyCuda , Real)(THCState *state, THCharStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THShortStorage_copyCuda , Real)(THCState *state, THShortStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src); + +THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src); +THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src); +THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src); + +#endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp new file mode 100644 index 0000000..c0924a5 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.cpp @@ -0,0 +1,731 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.cpp" +#else + +/**** access methods ****/ +THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self) +{ + return self->storage; +} + +ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) +{ + return self->storageOffset; +} + +int THCTensor_(nDimension)(THCState *state, const THCTensor *self) +{ + return THCTensor_nDimension(state, self); +} + +int THCTensor_(_nDimension)(THCState *state, const THCTensor *self) +{ + return THCTensor__nDimension(state, self); +} + +int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) +{ + return THCTensor_size(state, self, dim); +} + +int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) +{ + return THCTensor_stride(state, self, dim); +} + +THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) +{ + return THCTensor_newSizeOf(state, self); +} + +THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) +{ + THLongStorage *stride = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(stride, self->stride); + return stride; +} + +real *THCTensor_(data)(THCState *state, const THCTensor *self) +{ + if(self->storage) + return (THCStorage_(data)(state, self->storage)+self->storageOffset); + else + return NULL; +} + +/**** creation methods ****/ + +/* Empty init */ +THCTensor *THCTensor_(new)(THCState *state) +{ + return new THCTensor(THCStorage_(new)(state)); +} + +/* Pointer-copy init */ +THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) +{ + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, + self, + tensor->storage, + tensor->storageOffset, + tensor->dim(), + tensor->size, + tensor->stride); + return self; +} + +/* Storage init */ +THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) +{ + if(size && stride) + THArgCheck(size->size == stride->size, 4, "inconsistent size"); + + AT_CHECK(size, "size must not be null"); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, + self, + storage, + 
storageOffset, + size->size, + THLongStorage_data(size), + (stride ? THLongStorage_data(stride) : NULL)); + + return self; +} + +THCTensor *THCTensor_(newWithStorageIntLists)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); + + return self; +} + +THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0}, {stride0}); +} + +THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0, size1}, {stride0, stride1}); +} + +THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0, size1, size2}, {stride0, stride1, stride2}); +} + +THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2, + int64_t size3, int64_t stride3) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, + {size0, size1, size2, size3}, + {stride0, stride1, stride2, stride3}); +} + +THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size, THLongStorage *stride) +{ + return THCTensor_(newWithStorage)(state, NULL, 0, size, stride); +} + +THCTensor *THCTensor_(newWithSizeIntList)(THCState *state, at::IntList sizes) { + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(resizeNd)(state, self, sizes.size(), const_cast(sizes.data()), nullptr); + + return self; +} + +THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0) +{ + return THCTensor_(newWithSizeIntList)(state, {size0}); +} + +THCTensor *THCTensor_(newWithSize2d)(THCState *state, int64_t size0, int64_t size1) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1}); +} + +THCTensor *THCTensor_(newWithSize3d)(THCState *state, int64_t size0, int64_t size1, int64_t size2) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1, size2}); +} + +THCTensor *THCTensor_(newWithSize4d)(THCState *state, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1, size2, size3}); +} + +THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self) +{ + THCTensor *tensor = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, tensor, self); + THCTensor_(copy)(state, tensor, self); + return tensor; +} + +THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *self) +{ + if(!THCTensor_(isContiguous)(state, self)) { + return THCTensor_(newClone)(state, self); + } else { + THCTensor_(retain)(state, self); + return self; + } +} + +THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_) +{ + THCTensor *self = 
THCTensor_(newWithTensor)(state, tensor); + THCTensor_(select)(state, self, NULL, dimension_, sliceIndex_); + return self; +} + +THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(narrow)(state, self, NULL, dimension_, firstIndex_, size_); + return self; +} + +THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(transpose)(state, self, NULL, dimension1_, dimension2_); + return self; +} + +THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, int64_t size_, int64_t step_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(unfold)(state, self, NULL, dimension_, size_, step_); + return self; +} + +THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size) +{ + ptrdiff_t numel = THCTensor_(nElement)(state, tensor); + THCTensor *self = THCTensor_(new)(state); + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), + at::IntList(tensor->stride, tensor->dim()), + at::IntList(inferred_size->data(), inferred_size->size)); + THArgCheck(stride.has_value(), 2, "view size is " + "not compatible with input tensor's size and stride (at least one dimension spans " + "across two contiguous subspaces). Call .contiguous() before .view()."); + auto stride_value = *stride; + THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); + THLongStorage_rawCopy(new_stride, stride_value.data()); + THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THLongStorage_free(inferred_size); + THLongStorage_free(new_stride); + return self; +} + +// Collapses the first two dimensions of a tensor. +// Assumes the input tensor is contiguous. 
+THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input) { + int in_dims = THCTensor_(_nDimension)(state, input); + THArgCheck(in_dims >= 2, 1, "Tensor needs to have at least two dimensions"); + THArgCheck(THCTensor_(isContiguous)(state, input), 1, + "Tensor must be contiguous"); + THLongStorage *newSize = THLongStorage_newWithSize(in_dims - 1); + THLongStorage_data(newSize)[0] = THCTensor_(size)(state, input, 0) * THCTensor_(size)(state, input, 1); + for (int i = 2; i < in_dims; i++) { + THLongStorage_data(newSize)[i - 1] = THCTensor_(size)(state, input, i); + } + THCTensor *output = THCTensor_(newView)(state, input, newSize); + THLongStorage_free(newSize); + return output; +} + +/* Resize */ +void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) +{ + THCTensor_resize(state, self, size, stride); +} + +void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_resizeAs(state, self, src); +} + +void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0) +{ + int64_t size[1] = {size0}; + THCTensor_resizeNd(state, tensor, 1, size, nullptr); +} + +void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0, int64_t size1) +{ + int64_t size[2] = {size0, size1}; + THCTensor_resizeNd(state, tensor, 2, size, nullptr); +} + +void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0, int64_t size1, int64_t size2) +{ + int64_t size[3] = {size0, size1, size2}; + THCTensor_resizeNd(state, tensor, 3, size, nullptr); +} + +void THCTensor_(resize4d)(THCState *state, THCTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + int64_t size[4] = {size0, size1, size2, size3}; + THCTensor_resizeNd(state, self, 4, size, nullptr); +} + +void THCTensor_(resize5d)(THCState *state, THCTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4) +{ + int64_t size[5] = {size0, size1, size2, size3, size4}; + THCTensor_resizeNd(state, self, 5, size, nullptr); +} + +void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_set(state, self, src); +} + +void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) +{ + if(size_ && stride_) + THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); + + AT_CHECK(size_, "size must not be null"); + THCTensor_(setStorageNd)(state, + self, + storage_, + storageOffset_, + size_->size, + THLongStorage_data(size_), + (stride_ ? 
THLongStorage_data(stride_) : NULL)); +} + +void THCTensor_(setStorageIntLists)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + at::IntList sizes, at::IntList strides) +{ + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + + THCTensor_(setStorageNd)(state, self, storage_, storageOffset_, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); +} + +void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_}, {stride0_}); +} + +void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_, size1_}, + {stride0_, stride1_}); +} + +void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_, size1_, size2_}, + {stride0_, stride1_, stride2_}); +} + +void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_) +{ + + int64_t size[4] = {size0_, size1_, size2_, size3_}; + int64_t stride[4] = {stride0_, stride1_, stride2_, stride3_}; + + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, size, stride); +} + + +void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t firstIndex, int64_t size) +{ + if(!src) + src = self; + + THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); + THArgCheck( firstIndex >= 0, 4, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM + THArgCheck( size >= 0, 5, "out of range"); +#else + THArgCheck( size > 0, 5, "out of range"); +#endif + THArgCheck(firstIndex+size <= src->size[dimension], 5, "out of range"); + + THCTensor_(set)(state, self, src); + + if(firstIndex > 0) + self->storageOffset += firstIndex*self->stride[dimension]; + + self->size[dimension] = size; +} + +void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t sliceIndex) +{ + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SCALAR + THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 4, "out of range"); + + THCTensor_(set)(state, self, src); + THCTensor_(narrow)(state, self, NULL, dimension, sliceIndex, 1); + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; +} + +void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1, int dimension2) +{ + int64_t z; + + if(!src) + src = self; + + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + + THCTensor_(set)(state, self, src); + + if(dimension1 == dimension2) 
+ return; + + z = self->stride[dimension1]; + self->stride[dimension1] = self->stride[dimension2]; + self->stride[dimension2] = z; + z = self->size[dimension1]; + self->size[dimension1] = self->size[dimension2]; + self->size[dimension2] = z; +} + +void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t size, int64_t step) +{ + int64_t *newSize; + int64_t *newStride; + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck(dimension < src->dim(), 2, "out of range"); + THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(step > 0, 4, "invalid step"); + + THCTensor_(set)(state, self, src); + + newSize = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + newStride = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + + newSize[self->dim()] = size; + newStride[self->dim()] = self->stride[dimension]; + for(d = 0; d < self->dim(); d++) + { + if(d == dimension) + { + newSize[d] = (self->size[d] - size) / step + 1; + newStride[d] = step*self->stride[d]; + } + else + { + newSize[d] = self->size[d]; + newStride[d] = self->stride[d]; + } + } + + THFree(self->size); + THFree(self->stride); + + self->size = newSize; + self->stride = newStride; + self->dim_++; +} + +/* we have to handle the case where the result is a number */ +void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) +{ + int ndim = 0; + int d; + + if(!src) + src = self; + + THCTensor_(set)(state, self, src); + + for(d = 0; d < src->dim(); d++) + { + if(src->size[d] != 1) + { + if(d != ndim) + { + self->size[ndim] = src->size[d]; + self->stride[ndim] = src->stride[d]; + } + ndim++; + } + } + +#ifndef USE_TH_SCALAR + /* right now, we do not handle 0-dimension tensors */ + if(ndim == 0 && src->dim() > 0) + { + self->size[0] = 1; + self->stride[0] = 1; + ndim = 1; + } + self->dim_ = ndim; +} +#endif + +void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCTensor_squeeze1d(state, self, src, dimension); +} + +void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCTensor_unsqueeze1d(state, self, src, dimension); +} + +int THCTensor_(isContiguous)(THCState *state, const THCTensor *self) +{ + return THCTensor_isContiguous(state, self); +} + +int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims) +{ + int d; + if (self->dim() != dims->size) + return 0; + + for (d = 0; d < self->dim(); ++d) + { + if (self->size[d] != THLongStorage_data(dims)[d]) + return 0; + } + return 1; +} + +int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src) +{ + if (self->storage == src->storage && + self->storageOffset == src->storageOffset && + self->dim() == src->dim()) + { + int d; + for (d = 0; d < self->dim(); ++d) + { + if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + return 0; + } + return 1; + } + return 0; +} + +int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor* src) +{ + int d; + if (self->dim() != src->dim()) + return 0; + for(d = 0; d < self->dim(); ++d) + { + if(self->size[d] != src->size[d]) + return 0; + } + return 1; +} + +ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self) +{ + return THCTensor_nElement(state, self); +} + +void THCTensor_(retain)(THCState *state, THCTensor *self) +{ + THCTensor_retain(state, self); +} + +void 
THCTensor_(free)(THCState *state, THCTensor *self) +{ + THCTensor_free(state, self); +} + +void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) +{ + if(self != dst) + THCTensor_(copy)(state, dst, self); + + THCTensor_(free)(state, self); +} + +/*******************************************************************************/ + +void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); +} + +void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + +void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) +{ + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); +} + +real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) +{ + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); +} + +void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) +{ + THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); +} + +real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) +{ + THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); +} + +void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) +{ + THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); +} + +real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) +{ + THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); +} + +void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) +{ + THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of 
range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); +} + +real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) +{ + THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); +} + +int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) +{ + /* FIXME: remove this flag after any users stop using it since it is + now superseded by the runtime option */ +#ifdef DISABLE_CHECK_GPU + return 1; +#else + int kernelP2PEnabled = + THCState_getKernelPeerToPeerAccessEnabled(state); + + int curDev = -1; + THCudaCheck(cudaGetDevice(&curDev)); + va_list(args); + va_start(args, nTensors); + int valid = 1; + for (unsigned int i = 0; i < nTensors; i++) { + THCTensor* tensor = va_arg(args, THCTensor*); + if (tensor == NULL) { + continue; + } + int tensorDev = THCTensor_(getDevice)(state, tensor); + if (tensorDev == -1) { + /* This tensor does not have GPU memory (empty) */ + continue; + } + + if (tensorDev != curDev) { + if (kernelP2PEnabled) { + /* Kernel p2p access is allowed */ + /* Can `curDev` access `tensorDev` directly? */ + if (!THCState_getPeerToPeerAccess(state, curDev, tensorDev)) { + valid = 0; + break; + } + } else { + /* No kernel p2p access allowed */ + valid = 0; + break; + } + } + } + + va_end(args); + return valid; +#endif // DISABLE_CHECK_GPU +} + +THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor) { + const int L = THC_DESC_BUFF_LEN; + THCDescBuff buf; + char *str = buf.str; + int n = 0; + n += snprintf(str, L-n, "["); + int i; + for(i = 0; i < tensor->dim(); i++) { + if(n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + if(i < tensor->dim()-1) { + n += snprintf(str+n, L-n, " x "); + } + } + if(n < L - 2) { + snprintf(str+n, L-n, "]"); + } else { + snprintf(str+L-5, 5, "...]"); + } + return buf; +} + +#endif diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu new file mode 100644 index 0000000..9847834 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.cu @@ -0,0 +1,9 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.cu" +#else + +THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { + return THCTensor_getDevice(state, tensor); +} + +#endif diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h new file mode 100644 index 0000000..8e9bf84 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.h @@ -0,0 +1,141 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.h" +#else + +typedef struct THCTensor THCTensor; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THCTensor type. 
+#define THCudaTensor THCTensor +#define THCudaDoubleTensor THCTensor +#ifdef CUDA_HALF_TENSOR +#define THCudaHalfTensor THCTensor +#endif +#define THCudaByteTensor THCTensor +#define THCudaCharTensor THCTensor +#define THCudaShortTensor THCTensor +#define THCudaIntTensor THCTensor +#define THCudaLongTensor THCTensor + +/**** access methods ****/ +THC_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self); +THC_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self); + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). +THC_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(_nDimension)(THCState *state, const THCTensor *self); + +THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); +THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); +THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); + +THC_API void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag); +THC_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag); + + +/**** creation methods ****/ +THC_API THCTensor *THCTensor_(new)(THCState *state); +THC_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor); +/* stride might be NULL */ +THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +/* stride might be NULL */ +THC_API THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size_, THLongStorage *stride_); +THC_API THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0_); +THC_API THCTensor *THCTensor_(newWithSize2d)(THCState *state, int64_t size0_, int64_t size1_); +THC_API THCTensor *THCTensor_(newWithSize3d)(THCState *state, int64_t size0_, int64_t size1_, int64_t size2_); +THC_API THCTensor *THCTensor_(newWithSize4d)(THCState *state, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); + +THC_API THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self); +THC_API THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *tensor); +THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_); +THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_); +THC_API THCTensor *THCTensor_(newTranspose)(THCState 
*state, THCTensor *tensor, int dimension1_, int dimension2_); +THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, int64_t size_, int64_t step_); +THC_API THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size); +THC_API THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input); + +// resize* methods simply resize the storage. So they may not retain the current data at current indices. +// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the +// values, unless you are doing some size and stride tricks, do not use resize*. +THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src); +THC_API void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0_); +THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_); +THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_); +THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); +THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_); + +THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +THC_API void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t firstIndex_, int64_t size_); +THC_API void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t sliceIndex_); +THC_API void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1_, int dimension2_); +THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t size_, int64_t step_); + +THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int 
dimension_); +THC_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_); + +THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src); +THC_API int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src); +THC_API int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims); +THC_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self); + +THC_API void THCTensor_(retain)(THCState *state, THCTensor *self); +THC_API void THCTensor_(free)(THCState *state, THCTensor *self); +THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst); + +/* Slow access methods [check everything] */ +THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value); +THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value); +THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value); +THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value); + +THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0); +THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1); +THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2); +THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3); + +/* CUDA-specific functions */ +THC_API int THCTensor_(getDevice)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...); + +/* debug methods */ +THC_API THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor); + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp new file mode 100644 index 0000000..5715133 --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -0,0 +1,175 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.cpp" +#else + +/* specific methods */ + +void THCTensor_(copyCPU)(THCState *state, THCTensor *self, struct THTensor *src) +{ + THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match"); + + { + THCTensor *selfc = THCTensor_(newContiguous)(state, self); + src = THTensor_(newContiguous)(src); + + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state,selfc), + THTensor_(data)(src), + THTensor_(nElement)(src) * sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + + THTensor_(free)(src); + THCTensor_(freeCopyTo)(state, selfc, self); + } +} + +#define IMPLEMENT_TH_CUDA_TENSOR_COPY(TYPEC) \ +void THCTensor_(copy##TYPEC)(THCState *state, THCTensor *self, struct TH##TYPEC##Tensor *src) \ +{ \ + THArgCheck(THCTensor_(nElement)(state, self) == TH##TYPEC##Tensor_nElement(src), 2, "sizes do not match"); \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCTensor_(copyCPU)(state, self, (THTensor*) src); /* cast just removes warnings */ \ + } else { \ + THLongStorage *size = TH##TYPEC##Tensor_newSizeOf(src); \ + THTensor *srcf = THTensor_(newWithSize)(size, NULL); \ + \ + 
THTensor_(copy##TYPEC)(srcf, src); \ + THCTensor_(copyCPU)(state, self, srcf); \ + \ + THLongStorage_free(size); \ + THTensor_(free)(srcf); \ + } \ +} + +IMPLEMENT_TH_CUDA_TENSOR_COPY(Byte) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Char) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Short) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Int) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Long) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Float) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Double) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Half) + +/* copyCuda */ + +void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) +{ + THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); + + { + THTensor *selfc = THTensor_(newContiguous)(self); + src = THCTensor_(newContiguous)(state, src); + + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THTensor_(data)(selfc), + THCTensor_(data)(state, src), + THCTensor_(nElement)(state, src) * sizeof(real), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + + THCTensor_(free)(state, src); + THTensor_(freeCopyTo)(selfc, self); + } +} + +#define IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(TYPEC) \ + void TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(THCState *state, TH##TYPEC##Tensor *self, struct THCTensor *src) \ + { \ + THArgCheck(TH##TYPEC##Tensor_nElement(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THTensor_(copyCuda)(state, (THTensor*) self, src); /* cast just removes compiler warning */ \ + } else { \ + THLongStorage *size = THCTensor_(newSizeOf)(state, src); \ + THTensor *srcf = THTensor_(newWithSize)(size, NULL); \ + \ + THTensor_(copyCuda)(state, srcf, src); \ + TH_CONCAT_4(TH,TYPEC,Tensor_copy,Real)(self, srcf); \ + \ + THLongStorage_free(size); \ + THTensor_(free)(srcf); \ + } \ + } + +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Byte) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Char) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Short) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Int) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Half) + +void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_(copy)(state, self, src); +} + +void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src) +{ + THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match"); + THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous"); + THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous"); + + if (THCTensor_(nElement)(state, self) == 0) return; + + // Perform the copy wrt the current stream on the CudaTensor's device. 
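+  // (The code below temporarily switches to the tensor's device, if it is not already current, so that the async memcpy and the host-allocator event record are issued against that device's stream; the caller's device is restored afterwards.)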
+ int tensorDevice = THCTensor_(getDevice)(state, self); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } + + THCStream *stream = THCState_getStream(state); + THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self), + THTensor_(data)(src), + THTensor_(nElement)(src) * sizeof(real), + cudaMemcpyHostToDevice, + THCStream_stream(stream))); + + THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } +} + +void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src) +{ + THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); + THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous"); + + if (THTensor_(nElement)(self) == 0) return; + + // Perform the copy wrt the current stream on the CudaTensor's device. + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } + + THCStream *stream = THCState_getStream(state); + THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self), + THCTensor_(data)(state, src), + THCTensor_(nElement)(state, src) * sizeof(real), + cudaMemcpyDeviceToHost, + THCStream_stream(stream))); + + THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, src->storage), stream)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } +} + +#undef IMPLEMENT_TH_CUDA_TENSOR_COPY +#undef IMPLEMENT_TH_CUDA_TENSOR_COPY_TO + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu new file mode 100644 index 0000000..0e2630c --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -0,0 +1,80 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.cu" +#else + +THC_API void +THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { + if (dst == src) return; + THC_copyTensor(state, dst, src); +} + +template <> +THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { + THCTensor *tensor = THCTensor_new(state, self->storage->scalar_type); + THCTensor_resizeAs(state, tensor, self); + THC_copyTensor(state, tensor, self); + return tensor; +} + +template <> +THCTensor *THCTensor_newContiguous(THCState *state, THCTensor *self) +{ + if(!THCTensor_isContiguous(state, self)) { + return THCTensor_newClone(state, self); + } else { + THCTensor_retain(state, self); + return self; + } +} + + +template <> +void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor *dst) { + if(self != dst) + THC_copyTensor(state, dst, self); + + THCTensor_free(state, self); +} + +template <> +void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, THCTensor* src) { + // Called when we are copying into an overlapping index `dst`, but + // we don't care which writer wins. Hacky but it works. + // This is itself invoked by pointwiseApply2 / THCTensor_copy in + // case that there are write overlaps.
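+  // (One illustrative way such overlaps arise: a destination whose strides map several logical indices to the same memory location, e.g. a tensor expanded with a zero stride, so two threads may write the same element and an arbitrary writer wins.)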
+ // FIXME: really, overlapping writes should be illegal/an error in Torch + THC_pointwiseApply2( + state, dst, src, + CopyOp(), + ReadOnly, /* ignore overwrites */ + ReadOnly); +} + +THC_API void +THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { + THCTensor_copyIgnoringOverlaps(state, dst, src); +} + +#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ + THC_API void \ + THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ + THC_copyTensor(state, self, src); \ + } + +IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte, uint8_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char, int8_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short, int16_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int, int32_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) +// THCudaTensor aka the non-existent THCudaFloatTensor +IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) +#ifdef CUDA_HALF_TENSOR +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +#endif + +#undef IMPLEMENT_THC_CUDA_TENSOR_COPY + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.h b/aten/src/THC/generic/THCTensorCopy.h new file mode 100644 index 0000000..e549f09 --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.h @@ -0,0 +1,43 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.h" +#else + +THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src); +THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src); +THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src); +THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor *src); +THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src); +THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src); +THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src); +THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src); + +THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src); +THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src); +THC_API void THCTensor_(copyCudaShort)(THCState *state, THCTensor *dst, struct THCudaShortTensor *src); +THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *dst, struct THCudaIntTensor *src); +THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *dst, struct THCudaLongTensor *src); +THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *dst, struct THCudaTensor *src); +THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *dst, struct THCudaDoubleTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *dst, struct THCudaHalfTensor *src); +#endif + +THC_API void TH_CONCAT_2(THByteTensor_copyCuda , Real) (THCState *state, THByteTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THCharTensor_copyCuda , Real) (THCState *state, THCharTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THShortTensor_copyCuda , Real) (THCState *state, THShortTensor *self, THCTensor *src); +THC_API void 
TH_CONCAT_2(THIntTensor_copyCuda , Real) (THCState *state, THIntTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THLongTensor_copyCuda , Real) (THCState *state, THLongTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real) (THCState *state, THFloatTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real) (THCState *state, THHalfTensor *self, THCTensor *src); +THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src); + +THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src); +THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src); + +THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src); +THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src); + +#endif diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu new file mode 100644 index 0000000..0e6a7ff --- /dev/null +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -0,0 +1,654 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorIndex.cu" +#else + +// Check tensor dimensions for index operations, and return the slice size. +// src can be nullptr in case of indexFill: in that case it is ignored. +static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, + int dim, + THCudaLongTensor *index, + THCTensor *src) +{ + int dstDims = THCTensor_(_nDimension)(state, dst); + int srcDims = (src == nullptr) ? dstDims : THCTensor_(_nDimension)(state, src); + + THArgCheck(THCudaLongTensor__nDimension(state, index) == 1, 4, + "expecting vector of indices"); + THArgCheck(dim >= 0 && dim < dstDims, 2, "Indexing dim is out of bounds"); + + ptrdiff_t dstSliceSize = 1; + for (int d = 0; d < dstDims; d++) { + if (d != dim) { + dstSliceSize *= dst->size[d]; + } + } + + if (src == nullptr) return dstSliceSize; + + THArgCheck(dim < srcDims, 3, "Indexing dim is out of bounds"); + THArgCheck(THCudaLongTensor_nElement(state, index) == src->size[dim], 4, + "length of src.size[dim] is not equal to length of indices"); + + ptrdiff_t srcSliceSize = 1; + bool mismatch = false; + + if (dstDims != srcDims) mismatch = true; + + for (int d = 0; d < srcDims; d++) { + if (d != dim) { + srcSliceSize *= src->size[d]; + if (!mismatch && dst->size[d] != src->size[d]) mismatch = true; + } + } + + THArgCheck(dstSliceSize == srcSliceSize, 2, + "Source/destination tensor have different slice sizes (%ld vs %ld)", + dstSliceSize, srcSliceSize); + + if (mismatch) { + static bool warningShown = false; + if (!warningShown) { + warningShown = true; + fprintf(stderr, + "Warning: source/destination slices have same size but different " + "shape for an index operation. This behavior is deprecated.\n"); + } + } + + return dstSliceSize; +} + +// Compare the stride between adjacent slices (sliceStride) with strides in the +// other dimensions (i.e., strides *inside* each slice). +// +// - Returns true if some dimension inside the slice has lower stride than +// sliceStride. The simplest example is a 2-D contiguous tensor with sliceDim +// == 0 (that is, each slice is a row). +// +// In this case, we choose the CUDA kernel that processes the data in +// "index-major order". For example, if thread count equals slice size, then +// all threads process slice #0 in lockstep, and then slice #1, and so on. 
+// +// - Otherwise (i.e., sliceStride has the lowest value), this function returns +// false. The simplest example is a 2-D contiguous tensor with sliceDim == 1 +// (each slice is a column). +// +// In this case, we choose the CUDA kernel that processes the data in +// "elementInSlice-major order". For example, each thread can process element +// #0 of every slice, and then element #1 of every slice, and so on. +bool THCTensor_(indexShouldBeMajor)(TensorInfo &info, + int sliceDim) +{ + // The stride between adjacent slices (e.g., between element #0 of slice #100 + // and element #0 of slice #101). + unsigned int sliceStride = info.strides[sliceDim]; + + for (int i = 0; i < info.dims; ++i) { + if (i != sliceDim && info.sizes[i] > 1 && info.strides[i] < sliceStride) { + return true; + } + } + + return false; +} + +void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t sliceSize = THCTensor_(getSliceSize)(state, dst, dim, indices, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); + int64_t dstCopyDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexCopySmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexCopyLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstCopyDim, srcCopyDim, srcTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndices, \ + dstCopyDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstCopyDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstCopyDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcCopyDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcCopyDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstCopyDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstCopyDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstCopyDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcCopyDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcCopyDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLongTensor *index) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, + "tried to take from an empty tensor"); + + THCTensor_(resizeNd)(state, dst, index->dim(), index->size, NULL); + + // dispatchTakePut only handles non-empty tensors; + if (index->_dim() > 0) { + dispatchTakePut(state, src, dst, index); + } +} + +static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { + THCThrustAllocator 
thrustAlloc(state); + + auto index_iter = thrust::device_ptr(THCudaLongTensor_data(state, index)); + auto src_iter = thrust::device_ptr(THCTensor_(data)(state, src)); + auto numel = THCTensor_(numel)(state, src); + + thrust::sort_by_key( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + index_iter, index_iter + numel, + src_iter, ThrustLTOp()); +} + +void THCTensor_(put)(THCState *state, THCTensor *dst, THCudaLongTensor *index, THCTensor *src, int accumulate) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + ptrdiff_t dstSize = THCTensor_(nElement)(state, dst); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, index); + THArgCheck(THCTensor_(nElement)(state, src) == numIndices, + 3, "src should have the same number of elements as index"); + + THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + if (numIndices == 0) { + return; + } + + if (accumulate) { + // wrap indices so to replace negative indices + THCudaLongTensor* sorted_index = THCudaLongTensor_new(state); + THCudaLongTensor_resizeAs(state, sorted_index, index); + THC_pointwiseApply2(state, sorted_index, index, WrapIndexOp(dstSize)); + + THCTensor* sorted_src = THCTensor_(newClone)(state, src); + + THCTensor_(sort_indices)(state, sorted_index, sorted_src); + dispatchTakePut(state, dst, sorted_src, sorted_index); + + THCTensor_(free)(state, sorted_src); + THCudaLongTensor_free(state, sorted_index); + } else { + dispatchTakePut(state, dst, src, index); + } +} + +void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t sliceSize = THCTensor_(getSliceSize)(state, dst, dim, indices, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); + int64_t dstAddDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddSmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstAddDim, srcAddDim, sliceSize, dstAddDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexAddLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstAddDim, srcAddDim, srcTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndices, \ + dstAddDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstAddDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstAddDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcAddDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcAddDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstAddDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstAddDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstAddDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcAddDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcAddDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, real val) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. 
+ ptrdiff_t sliceSize = + THCTensor_(getSliceSize)(state, dst, dim, indices, nullptr); + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); + int64_t dstFillDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM) \ + indexFillSmallIndex \ + <<>>( \ + dstInfo, indicesInfo, \ + dstFillDim, sliceSize, dstFillDimSize, val); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexFillLargeIndex \ + <<>>( \ + dstInfo, indicesInfo, \ + dstFillDim, sliceSize * numIndices, \ + (IDX_IS_MAJOR) ? sliceSize : numIndices, \ + dstFillDimSize, val); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstFillDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstFillDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, -2); + } else if (dstInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, -2); + } else if (dstInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstFillDim); + + if (dstInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, -2, true); + } else if (dstInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, -2, false); + } + } else if (dstInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstFillDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstFillDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THCudaLongTensor *indices) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, dst, src, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + + int srcDims = 
THCTensor_(_nDimension)(state, src); + cudaStream_t stream = THCState_getCurrentStream(state); + + THArgCheck(THCudaLongTensor__nDimension(state, indices) <= 1, 3, + "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds"); + THArgCheck(srcDims > 0, 2, "Source tensor is empty"); + + THLongStorage *newSize; + + if (numIndices == 0) { + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, 0, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + return; + } + + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, dim, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + + int indContig = THCudaLongTensor_isContiguous(state, indices); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); + int64_t srcSelectDimSize = THCTensor_(size)(state, src, dim); + ptrdiff_t sliceSize = dstTotalSize / numIndices; + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexSelectSmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexSelectLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstSelectDim, srcSelectDim, dstTotalSize, \ + (IDX_IS_MAJOR) ? sliceSize : numIndices, \ + srcSelectDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstSelectDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstSelectDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcSelectDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcSelectDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstSelectDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned 
int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstSelectDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstSelectDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcSelectDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcSelectDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +#endif diff --git a/aten/src/THC/generic/THCTensorIndex.h b/aten/src/THC/generic/THCTensorIndex.h new file mode 100644 index 0000000..03ff54c --- /dev/null +++ b/aten/src/THC/generic/THCTensorIndex.h @@ -0,0 +1,12 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorIndex.h" +#else + +THC_API void THCTensor_(indexCopy)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src); +THC_API void THCTensor_(indexAdd)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src); +THC_API void THCTensor_(indexFill)(THCState *state, THCTensor *tensor, int dim, THCudaLongTensor *index, real val); +THC_API void THCTensor_(indexSelect)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index); +THC_API void THCTensor_(take)(THCState *state, THCTensor *res_, THCTensor *src, THCudaLongTensor *index); +THC_API void THCTensor_(put)(THCState *state, THCTensor *res_, THCudaLongTensor *indices, THCTensor *src, int accumulate); + +#endif diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu new file mode 100644 index 0000000..80c1344 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -0,0 +1,193 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.cu" +#else + + +THC_API void +THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); + THArgCheck(THCTensor_(nElement)(state, tensor) == + THCudaByteTensor_nElement(state, mask), + 2, "sizes do not match"); + + if (!THC_pointwiseApply2(state, tensor, mask, + TensorMaskedFillOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedFill)(state, tensor, maskCuda, value); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); + ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); + ptrdiff_t srcSize = THCTensor_(nElement)(state, src); + + // `mask` and `tensor` must have the same number of elements + 
THArgCheck(maskSize == tensorSize, 2, + "mask and tensor must have the same number of elements"); + + // Determine our output size + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); + + // The number of `1` elements present in the mask must be <= the + // number of elements available in `src` + if (totalElements > srcSize) { + THArgCheck(false, 2, "source nElements must be == mask `1` elements"); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (int64_t) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); + + // update `tensor` where `mask` == 1 but pull from `src` at + // maskPrefixSum + bool status = THC_pointwiseApply3( + state, tensor, mask, maskPrefixSum, + TensorMaskedCopyOp( + THCTensor_(data)(state, contigSrc))); + + THCTensor_(free)(state, contigSrc); + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedCopy)(state, tensor, maskCuda, src); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + THArgCheck(THCudaByteTensor_nElement(state, mask) == + THCTensor_(nElement)(state, src), + 2, "sizes do not match"); + + // Determine our output size + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); + THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor); + + THCTensor_(resize1d)(state, tensorContig, totalElements); + if (tensor != tensorContig) { + THCTensor_(resize1d)(state, tensor, totalElements); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? 
Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (int64_t) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // Then copy over the masked elements at their desired output index + bool status = THC_pointwiseApply3( + state, mask, maskPrefixSum, + src, TensorMaskedSelectOp( + THCTensor_(data)(state, tensor))); + + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + if (tensor != tensorContig) { + THCTensor_(freeCopyTo)(state, tensorContig, tensor); + } else { + THCTensor_(free)(state, tensorContig); + } + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void +THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedSelect)(state, tensor, src, maskCuda); + THCudaByteTensor_free(state, maskCuda); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMasked.h b/aten/src/THC/generic/THCTensorMasked.h new file mode 100644 index 0000000..98f5aee --- /dev/null +++ b/aten/src/THC/generic/THCTensorMasked.h @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.h" +#else + +THC_API void THCTensor_(maskedFill)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + real value); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedFillByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + real value); + +THC_API void THCTensor_(maskedCopy)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + THCTensor *src); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedCopyByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + THCTensor *src); + +THC_API void THCTensor_(maskedSelect)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THCudaByteTensor *mask); + +// FIXME: remove now that we have THCudaByteTensor? 
+THC_API void THCTensor_(maskedSelectByte)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THByteTensor *mask); + +#endif diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu new file mode 100644 index 0000000..8bdd8fa --- /dev/null +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -0,0 +1,485 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.cu" +#else + +THC_API void +THCTensor_(fill)(THCState* state, THCTensor *self_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + + if (!THC_pointwiseApply1( + state, self_, TensorFillOp(value))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zero)(THCState *state, THCTensor *self_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + if (THCTensor_(isContiguous)(state, self_)) { + THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_), + 0, + sizeof(real) * THCTensor_(nElement)(state, self_), + THCState_getCurrentStream(state))); + } else { + if (!THC_pointwiseApply1( + state, self_, + TensorFillOp(ScalarConvert::to(0)))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); + THCTensor_(resizeAs)(state, r_, input); + THCTensor_(zero)(state, r_); +} + +THC_API void +THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); + THCTensor_(resizeAs)(state, r_, input); + THCTensor_(fill)(state, r_, ScalarConvert::to(1)); +} + +ptrdiff_t +THCTensor_(numel)(THCState *state, THCTensor *t) +{ + return THCTensor_(nElement)(state, t); +} + +void THCTensor_(cat)(THCState *state, THCTensor *result, + THCTensor *ta, THCTensor *tb, int dimension) +{ + THCTensor* inputs[2]; + inputs[0] = ta; + inputs[1] = tb; + THCTensor_(catArray)(state, result, inputs, 2, dimension); +} + +void THCTensor_(check_shape_except_dim)(THCState *state, + THCTensor *first, THCTensor *second, int dimension); +inline void THCTensor_(check_shape_except_dim)(THCState *state, + THCTensor *first, THCTensor *second, int dimension) +{ + int first_dims = first->dim(); + int second_dims = second->dim(); + THArgCheck(first_dims == second_dims, 0, + "Tensors must have same number of dimensions: got %d and %d", + first_dims, second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = THCTensor_(size)(state, first, dim); + int64_t second_dim_size = THCTensor_(size)(state, second, dim); + THArgCheck(first_dim_size == second_dim_size, 0, + "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", + dimension, (long long)first_dim_size, (long long)second_dim_size, dim); + } +} + +void THCTensor_(catArray)(THCState *state, THCTensor *result, + THCTensor **inputs, int numInputs, int dimension) +{ + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). 
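+  // For example, a 1-D tensor of size [0] in `inputs` is skipped, while an empty tensor of any other shape is not.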
+ // FIXME: warn if this is the case + THLongStorage *size; + int i, j, cohortMax; + int64_t offset; + bool hasSkippedInput = false; + THCTensor *notSkippedTensor = NULL; // non-owning reference + auto should_skip = [](THCTensor *t) { return t->is_empty() && t->dim() == 1; }; + int nDims = 0; + + for (i = 0; i < numInputs; i++) + { + if (should_skip(inputs[i])) { + hasSkippedInput = true; + continue; + } + nDims = inputs[i]->dim(); + notSkippedTensor = inputs[i]; + } + + // If all inputs are empty tensors, return an empty tensor + if (notSkippedTensor == NULL) { + return; + } + + THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs); + THArgCheck(dimension >= 0, 4, "invalid dimension %d", dimension); + + size = THLongStorage_newWithSize(nDims); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + for (int i = 0; i < numInputs; i++) { + THCTensor *tensor = inputs[i]; + if (should_skip(tensor)) { + continue; + } + THCTensor_(check_shape_except_dim)(state, notSkippedTensor, tensor, dimension); + cat_dim_size += THCTensor_(size)(state, tensor, dimension); + } + + // Compute the size of the result + for (int dim = 0; dim < nDims; dim++) { + int64_t result_dim_size = THCTensor_(size)(state, notSkippedTensor, dim); + if (dim == dimension) { + result_dim_size = cat_dim_size; + } + THLongStorage_data(size)[dim] = result_dim_size; + } + THCTensor_(resize)(state, result, size, NULL); + THLongStorage_free(size); + + // We parallelize the copy if all 6 conditions pass: + // + // 1. There is more than one input tensor + // 2. No empty inputs + // 3. The result tensor is 32-bit indexable + // 4. The number of dimensions is <= 4 + // 5. All input tensors are contiguous (output tensor may be non-contig) + // 6. All input tensors can use 32-bit indexing + // 7. All input tensors are on the same device + + if (numInputs > 1 && + !hasSkippedInput && + result->dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + THCTensor_canUse32BitIndexMath(state, result) && + THCTensor_allContiguous(state, inputs, numInputs) && + THCTensor_all32BitIndexable(state, inputs, numInputs) && + THCTensor_allSameDevice(state, inputs, numInputs)) { + + // First, let's set up our kernel parameters. We start with a raw pointer to the storage + // for the output Tensor. + real *data = THCTensor_(data)(state, result); + + // Kernel Parameter + size_t tensorMetadataSize = sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; + auto d_inputs = static_cast *>(THCudaMalloc(state, tensorMetadataSize)); + + OutputTensorSizeStride param; + + // Next, let's initialize the size, stride arrays for the output Tensor. 
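+    // `param` is passed to the copy kernel by value, so the sizes/strides captured here are a snapshot of `result`.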
+ for (i = 0; i < nDims; ++i) { + param.outputSize[i] = THCTensor_(size)(state, result, i); + param.outputStride[i] = THCTensor_(stride)(state, result, i); + } + + THCStream* stream = THCState_getStream(state); + + // Template Declarations for dim = 1, 2, 3, 4 +#define HANDLE_CASE(DIMS) \ + CatArrayBatchedCopy<<>>(data, d_inputs, param, dimension, param.outputStride[dimension]); + + // Now we loop + offset = 0; + for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) { + // Re-allocate stackInputs every iteration to avoid read-after-write hazard + { + auto stackInputs_owner = THCudaHostAlloc(state, tensorMetadataSize); + CatArrInputTensor* stackInputs = static_cast*>(stackInputs_owner.get()); + cohortMax = 0; + for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) { + int64_t dimSize = THCTensor_(size)(state, inputs[i+j], dimension); + + stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]); + stackInputs[j].offset = offset; + stackInputs[j].dimSize = dimSize; + stackInputs[j].nElements = THCTensor_(nElement)(state, inputs[i+j]); + cohortMax = cohortMax > (int) stackInputs[j].nElements ? cohortMax : (int) stackInputs[j].nElements; + + // update offset + offset += dimSize; + } + THCudaCheck(cudaMemcpyAsync( + d_inputs, + stackInputs, + j * sizeof(CatArrInputTensor), + cudaMemcpyHostToDevice, + THCStream_stream(stream))); + THCudaHostRecord(state, stackInputs); + } + + // Next, let's consider how we set our kernel launch parameters. + // We borrow from THCApply, which the kernel's internal indexing + // is based on. + dim3 applyBlock = getApplyBlock(); + + //Get grid where x dim fills half gpu and y dim is number of tensors. + //This will have cating two tensors fill the entire grid, but prevent + //many threads from needlessly load meta data if their sizes is small. 
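+      // j counts the tensors staged in this batch, so catGrid.y gets one row of blocks per staged tensor.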
+ dim3 catGrid; + getCatGrid(state, j, catGrid); + + + switch (nDims) { + case 1: + HANDLE_CASE(1); + break; + case 2: + HANDLE_CASE(2); + break; + case 3: + HANDLE_CASE(3); + break; + case 4: + HANDLE_CASE(4); + break; + } + THCudaCheck(cudaGetLastError()); + } + THCudaFree(state, d_inputs); +#undef HANDLE_CASE + } else { + offset = 0; + for (j = 0; j < numInputs; j++) + { + if (should_skip(inputs[j])) continue; + int64_t dimSize = THCTensor_(size)(state, inputs[j], dimension); + THCTensor *nt = THCTensor_(newWithTensor)(state, result); + THCTensor_(narrow)(state, nt, NULL, dimension, offset, dimSize); + THCTensor_(copy)(state, nt, inputs[j]); + THCTensor_(free)(state, nt); + offset += dimSize; + } + } +} + +void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, + THCTensor *self) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self )); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, tensor)); + + + using namespace thrust::placeholders; + THCThrustAllocator thrustAlloc(state); + self = THCTensor_(newContiguous)(state, self); + thrust::device_ptr self_data(THCTensor_(data)(state, self)); + + int num_dim = THCTensor_(nDimension)(state, self); + int64_t N = THCTensor_(nElement)(state, self); + + THCudaLongTensor_resize2d(state, tensor, N, num_dim); + tensor = THCudaLongTensor_newContiguous(state, tensor); + thrust::device_ptr tensor_data(THCudaLongTensor_data(state, tensor)); + + thrust::counting_iterator idxfirst(0); + thrust::counting_iterator idxlast = idxfirst + N; + + typedef thrust::device_ptr Iter; + strided_range strided_tensor(tensor_data, + tensor_data+N*num_dim, num_dim); + +#if CUDA_VERSION >= 7000 + cudaStream_t stream = THCState_getCurrentStream(state); +#endif + + strided_range::iterator dend = thrust::copy_if( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(stream), +#endif + idxfirst, + idxlast, + self_data, + strided_tensor.begin(), + NonZeroOp() + ); + + int64_t num_nonzeros = thrust::distance(strided_tensor.begin(), dend); + + int64_t div = 1; + for (int dim = num_dim-1; dim >= 0; dim--) { + strided_range stride_dim(tensor_data+dim, + tensor_data+N*num_dim, num_dim); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(stream), +#endif + strided_tensor.begin(), + strided_tensor.end(), + stride_dim.begin(), + idx_functor(div, self->size[dim]) + ); + div *= self->size[dim]; + } + + THCudaLongTensor_resize2d(state, tensor, num_nonzeros, num_dim); + + THCTensor_(free)(state, self); + THCudaLongTensor_free(state, tensor); + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + int nDimension = THCTensor_(nDimension)(state, src_); +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!src_->is_empty()); +#endif + THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); + if (nDimension == 2) { + int64_t stride0 = THCTensor_(stride)(state, src_, 0); + int64_t stride1 = THCTensor_(stride)(state, src_, 1); + int64_t size0 = THCTensor_(size)(state, src_, 0); + int64_t size1 = THCTensor_(size)(state, src_, 1); + int64_t size = (k > 0) ? 
min((int64_t)size0, (int64_t)size1 - k) : min((int64_t)size0 + k, (int64_t)size1); + THCTensor_(resize1d)(state, self_, size); + if (size > 0) { + int64_t strideSelf = THCTensor_(stride)(state, self_, 0); + const dim3 threads(min((int64_t)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (int64_t)size)); + dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (int64_t)threads.x))); + int64_t start = (k >= 0 ? k * stride1 : -k * stride0); + THCTensor_copyFromDiagonal<<>> + (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, size, stride0 + stride1, strideSelf); + } + } else { + ptrdiff_t totalElements = THCTensor_(nElement)(state, src_); + ptrdiff_t size = (k > 0) ? totalElements + k : totalElements - k; + int64_t strideSrc = THCTensor_(stride)(state, src_, 0); + THCTensor_(resize2d)(state, self_, size, size); + THCTensor_(zero)(state, self_); + if (size > 0) { + int64_t stride0 = THCTensor_(stride)(state, self_, 0); + int64_t stride1 = THCTensor_(stride)(state, self_, 1); + const dim3 threads(min((int64_t)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (int64_t)size)); + dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (ptrdiff_t)threads.x))); + ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0); + THCTensor_copyToDiagonal<<>> + (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, totalElements, stride0 + stride1, strideSrc); + } + } + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(eye)(THCState *state, THCTensor *self_, int64_t n, int64_t m) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + THArgCheck(n > 0, 1, "invalid argument"); + + if(m <= 0) + m = n; + + THCTensor_(resize2d)(state, self_, n, m); + THCTensor_(zero)(state, self_); + + int64_t sz = THMin(n, m); + int64_t stride = THCTensor_(stride)(state, self_, 0) + + THCTensor_(stride)(state, self_, 1); + + THCTensor *diag = THCTensor_(newWithStorage1d)(state, self_->storage, + self_->storageOffset, sz, stride); + + THCTensor_(fill)(state, diag, ScalarConvert::to(1)); + THCTensor_(free)(state, diag); +} + +accreal THCTensor_(trace)(THCState *state, THCTensor *src_) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, src_)); + THArgCheck((src_->_dim() == 2), 1, "expected a matrix"); + THCTensor *diag = THCTensor_(new)(state); + THCTensor_(diag)(state, diag, src_, 0); + accreal trace = THCTensor_(sumall)(state, diag); + THCTensor_(free)(state, diag); + return trace; +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n); + if (n == 0) { + // skip + } else if (n == 1) THCTensor_(fill)(state, r_, a); + else { + THCTensor *r = THCTensor_(isContiguous)(state, r_) + ? 
r_ // if r_ is contiguous we can direct work on it + : THCTensor_(newContiguous)(state, r_); + real step = THCNumerics::div(THCNumerics::sub(b, a), + ScalarConvert::to(n - 1)); + LinspaceOp linspace_method(a, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + n, linspace_method); + if (!THCTensor_(isContiguous)(state, r_)) { // We need to move data back to r_ + THCTensor_(freeCopyTo)(state, r, r_); + } + } + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n); + if (n == 0) { + // skip + } else if (n == 1) THCTensor_(fill)(state, r_, THCNumerics::exp10(a)); + else { + THCTensor *r = THCTensor_(isContiguous)(state, r_) + ? r_ + : THCTensor_(newContiguous)(state, r_); + real step = THCNumerics::div(THCNumerics::sub(b, a), + ScalarConvert::to(n - 1)); + LogspaceOp logspace_method(a, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + n, logspace_method); + if (!THCTensor_(isContiguous)(state, r_)) { + THCTensor_(freeCopyTo)(state, r, r_); + } + } + THCudaCheck(cudaGetLastError()); +} + +#endif + +void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + ptrdiff_t size = (ptrdiff_t) (((xmax - xmin) / step) + 1); + if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size); + THCTensor *r = THCTensor_(newContiguous)(state, r_); + LinspaceOp linspace_method(xmin, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + size, linspace_method); + THCTensor_(freeCopyTo)(state, r, r_); + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(arange)(THCState* state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + ptrdiff_t size = (ptrdiff_t) ceil(ScalarConvert::to(xmax - xmin) / step); + if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size); + THCTensor *r = THCTensor_(newContiguous)(state, r_); + LinspaceOp linspace_method(xmin, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + size, linspace_method); + THCTensor_(freeCopyTo)(state, r, r_); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMath.h b/aten/src/THC/generic/THCTensorMath.h new file mode 100644 index 0000000..1cd7534 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMath.h @@ -0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.h" +#else + +THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value); +THC_API void THCTensor_(zero)(THCState *state, THCTensor *self); + +THC_API 
void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor* input); +THC_API void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor* input); +THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t); +THC_API void THCTensor_(cat)(THCState *state, THCTensor *result, THCTensor *ta, THCTensor *tb, int dimension); +THC_API void THCTensor_(catArray)(THCState *state, THCTensor *result, THCTensor **inputs, int numInputs, int dimension); +THC_API void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, THCTensor *self); + +THC_API void THCTensor_(tril)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(triu)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(diag)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(eye)(THCState *state, THCTensor *self, int64_t n, int64_t k); + +THC_API accreal THCTensor_(trace)(THCState *state, THCTensor *self); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n); +THC_API void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n); + +#endif + +THC_API void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step); +THC_API void THCTensor_(arange)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu new file mode 100644 index 0000000..6d1da07 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -0,0 +1,944 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.cu" +#else + +#define ERROR_ONLY_FP_TYPES(func) \ + THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); + +THC_API accreal +THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + THArgCheck(THCTensor_(nElement)(state, self) == + THCTensor_(nElement)(state, src), 2, "sizes do not match"); + + self = THCTensor_(newContiguous)(state, self); + src = THCTensor_(newContiguous)(state, src); + +#ifdef THC_REAL_IS_FLOAT + accreal result = THCudaBlas_Sdot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#elif defined(THC_REAL_IS_DOUBLE) + accreal result = THCudaBlas_Ddot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#elif defined(THC_REAL_IS_HALF) + accreal result = ScalarConvert::to( + THCudaBlas_Hdot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1)); +#endif + + THCTensor_(free)(state, src); + THCTensor_(free)(state, self); + return result; + +#else + ERROR_ONLY_FP_TYPES("dot"); + return ScalarConvert::to(0); +#endif +} + +THC_API void +THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); + if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + THError("matrix and vector expected"); + + if( mat->size[1] != vec->size[0] ) + THError("size mismatch"); + + if(t->_dim() != 1) + THError("size mismatch"); + + if(t->size[0] != mat->size[0]) + THError("size mismatch"); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + if(r_ != t) + { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(mat->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else if(mat->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cmat = THCTensor_(newContiguous)(state, mat); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, cmat), cmat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, cmat), cmat->stride[0], + THCTensor_(data)(state, vec), 
vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + + THCTensor_(free)(state, cmat); + } + +#elif defined(THC_REAL_IS_HALF) + // Currently no Hgemv/SgemvEx in Cublas + THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size[0], 1); + + THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); + THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size[0], 1); + + THCTensor_(addmm)(state, r_, beta, tAsMatrix, alpha, mat, vecAsMatrix); + + // r_ will have answer as matrix, need to return a vector + THCTensor_(resize1d)(state, r_, r_->size[0]); + THCTensor_(free)(state, vecAsMatrix); + THCTensor_(free)(state, tAsMatrix); +#endif +#else + ERROR_ONLY_FP_TYPES("addmv"); +#endif +} + +THC_API void +THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); + if ( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) { + THError("vector and vector expected"); + } + + if (t->_dim() != 2) { + THError("size mismatch"); + } + + if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + THError("size mismatch"); + } + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + if (r_ != t) { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(THCNumerics::eq(beta, ScalarConvert::to(0))) { + THCTensor_(zero)(state, r_); + } else if(THCNumerics::ne(beta, ScalarConvert::to(1))) { + THCTensor_(mul)(state, r_, r_, beta); + } + + if(r_->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#endif + } + else if(r_->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cr = THCTensor_(newClone)(state, r_); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#endif + + THCTensor_(freeCopyTo)(state, cr, r_); + } +#elif defined(THC_REAL_IS_HALF) + // currently no Hger/SgerEx in Cublas. 
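+  // Emulate ger via addmm: view vec1 as an n x 1 matrix and the transposed vec2 as a 1 x m matrix, so the GEMM forms the outer product.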
+ THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); + THCTensor_(resize2d)(state, vec2T, vec2T->size[0], 1); + THCTensor_(transpose)(state, vec2T, NULL, 0, 1); + + THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); + THCTensor_(resize2d)(state, vec1M, vec1M->size[0], 1); + + THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); + THCTensor_(free)(state, vec2T); + THCTensor_(free)(state, vec1M); +#endif +#else + ERROR_ONLY_FP_TYPES("addr"); +#endif +} + +THC_API void +THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *m1, THCTensor *m2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2)); + char transpose_r, transpose_m1, transpose_m2; + THCTensor *r__, *m1_, *m2_; + + if( (m1->_dim() != 2) || (m2->_dim() != 2) ) + THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + + if(t->_dim() != 2) + THError("matrix expected, got %dD tensor for t", t->_dim()); + + if(m1->size[1] != m2->size[0]) { + THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); + THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); + THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); + } + + if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + THCDescBuff bt = THCTensor_(sizeDesc)(state, t); + THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); + THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); + THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str); + } + + if(t != r_) + { + THCTensor_(resizeAs)(state, r_, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, r_, t); + } + } + + /* r_ */ + if(r_->stride[0] == 1 && + r_->stride[1] != 0) + { + transpose_r = 'n'; + r__ = r_; + } + else if(r_->stride[1] == 1 && + r_->stride[0] != 0) + { + THCTensor *swap = m2; + m2 = m1; + m1 = swap; + transpose_r = 't'; + r__ = r_; + } + else + { + transpose_r = 'n'; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, r_, 0, 1); + r__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, r__, NULL, 0, 1); + } + + /* m1 */ + if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m1 = 'n'; + m1_ = m1; + } + else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m1 = 't'; + m1_ = m1; + } + else + { + transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); + m1_ = THCTensor_(newContiguous)(state, m1); + } + + /* m2 */ + if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m2 = 'n'; + m2_ = m2; + } + else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m2 = 't'; + m2_ = m2; + } + else + { + transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); + m2_ = THCTensor_(newContiguous)(state, m2); + } + +#ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 
0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#endif + + /* free intermediate variables */ + if(m1_ != m1) { + THCTensor_(free)(state, m1_); + } + + if(m2_ != m2) { + THCTensor_(free)(state, m2_); + } + + if(r__ != r_) { + THCTensor_(freeCopyTo)(state, r__, r_); + } +#else + ERROR_ONLY_FP_TYPES("addmm"); +#endif +} + +THC_API void +THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(_nDimension)(state, t) == 2, 4, "expected 2D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + + int64_t batchnum = THCTensor_(size)(state, batch1, 0); + int64_t m1d1 = THCTensor_(size)(state, batch1, 1); + int64_t innerdim = THCTensor_(size)(state, batch1, 2); + int64_t m2d2 = THCTensor_(size)(state, batch2, 2); + + THArgCheck(batchnum == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + // M is t, as listed in the docs under addbmm + THArgCheck(m1d1 == THCTensor_(size)(state, t, 0), 6, + "first dimension must match first dimension of M"); + THArgCheck(m2d2 == THCTensor_(size)(state, t, 1), 7, + "second dimension must match second dimension of M"); + THArgCheck(innerdim == THCTensor_(size)(state, batch2, 1), 6, + "second dimension must match first dimension of batch2"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, result, t); + } + } + + THCTensor *slice1 = THCTensor_(new)(state); + THCTensor *slice2 = THCTensor_(new)(state); + for (int64_t i=0; i::to(1); + } + THCTensor_(free)(state, slice1); + THCTensor_(free)(state, slice2); +#else + ERROR_ONLY_FP_TYPES("addbmm"); +#endif +} + +__global__ void createBatchGemmBuffer(const real** buffer, real* data, + int64_t stride, int64_t num_batches) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) { + buffer[idx] = data + idx * stride; + } +} + +__global__ void 
createBatchGemmBuffer3(const real** buffer1, const real ** buffer2, const real ** buffer3, real* data1, + real * data2, real * data3, int64_t stride1, int64_t stride2, int64_t stride3, int64_t num_batches) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) { + buffer1[idx] = data1 + idx * stride1; + buffer2[idx] = data2 + idx * stride2; + buffer3[idx] = data3 + idx * stride3; + } +} + +THC_API void +THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(_nDimension)(state, t) == 3, 4, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 1) == THCTensor_(size)(state, batch1, 1), 6, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, t, 2) == THCTensor_(size)(state, batch2, 2), 7, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, batch1, 2) == THCTensor_(size)(state, batch2, 1), 6, + "wrong matrix size"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, result, t); + } + } + + bool transpose_result; + char transpose_batch1, transpose_batch2; + int64_t lda, ldb, ldc; + THCTensor *result_, *batch1_, *batch2_; + if (result->stride[1] == 1) + { + transpose_result = false; + result_ = result; + ldc = result_->stride[2]; + } + else if (result->stride[2] == 1) + { + transpose_result = true; + + THCTensor *swap = batch2; + batch2 = batch1; + batch1 = swap; + + result_ = result; + ldc = result_->stride[1]; + } + else + { + transpose_result = false; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, result, 1, 2); + result_ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, result_, NULL, 1, 2); + + ldc = result_->stride[2]; + } + + if (batch1->stride[transpose_result ? 2 : 1] == 1 && + batch1->stride[transpose_result ? 1 : 2] != 0) + { + transpose_batch1 = 'n'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 1 : 2]; + } + else if (batch1->stride[transpose_result ? 1 : 2] == 1 && + batch1->stride[transpose_result ? 2 : 1] != 0) + { + transpose_batch1 = 't'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch1 = transpose_result ? 'n' : 't'; + // batch1_ is later freed if batch1_ != batch1 + if (THCTensor_(isContiguous)(state, batch1)) { + batch1_ = batch1; + } else { + batch1_ = THCTensor_(newContiguous)(state, batch1); + } + lda = batch1_->stride[1]; + } + + if (batch2->stride[transpose_result ? 2 : 1] == 1 && + batch2->stride[transpose_result ? 1 : 2] != 0) + { + transpose_batch2 = 'n'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 1 : 2]; + } + else if (batch2->stride[transpose_result ? 1 : 2] == 1 && + batch2->stride[transpose_result ? 
2 : 1] != 0) + { + transpose_batch2 = 't'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch2 = transpose_result ? 'n' : 't'; + // batch2_ is later freed if batch2_ != batch2 + if (THCTensor_(isContiguous)(state, batch2)) { + batch2_ = batch2; + } else { + batch2_ = THCTensor_(newContiguous)(state, batch2); + } + ldb = batch2_->stride[1]; + } + int64_t num_batches = result_->size[0]; + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + // Compute pointers to matrices in each batch. +#if CUDA_VERSION < 8000 + size_t matrices_size = num_batches * sizeof(real*); + +// Copy pointers to device. + auto d_matrices1 = static_cast(THCudaMalloc(state, matrices_size)); + auto d_matrices2 = static_cast(THCudaMalloc(state, matrices_size)); + auto d_result_matrices = static_cast(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + + createBatchGemmBuffer3<<>>( + d_matrices1, d_matrices2, (const real**)d_result_matrices, THCTensor_(data)(state, batch1_), + THCTensor_(data)(state, batch2_), THCTensor_(data)(state, result_), + batch1_->stride[0], batch2_->stride[0], result_->stride[0], num_batches); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_SgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_DgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#endif //THC_REAL + + THCudaFree(state, d_matrices1); + THCudaFree(state, d_matrices2); + THCudaFree(state, d_result_matrices); + +#else +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_SgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_DgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); +#endif //THC_REAL +#endif //CUDA_VERSION + +#elif defined(THC_REAL_IS_HALF) + +#if CUDA_VERSION < 9010 + // Currently no HgemmBatched in Cublas + for (int64_t i = 0; i < num_batches; ++i) { + THCudaBlas_Hgemm( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 
1 : 2], + alpha, + THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + beta, + THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + } +#else + cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); + if (prop->major >= 5){ + + THCudaBlas_HgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); + } else { + for (int64_t i = 0; i < num_batches; ++i) { + THCudaBlas_Hgemm( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + beta, + THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + } + } + +#endif +#endif + if (batch1_ != batch1) { + THCTensor_(free)(state, batch1_); + } + + if (batch2_ != batch2) { + THCTensor_(free)(state, batch2_); + } + + if (result_ != result) { + THCTensor_(freeCopyTo)(state, result_, result); + } + +#else + ERROR_ONLY_FP_TYPES("baddbmm"); +#endif +} + +THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); + THArgCheck(THCTensor_(_nDimension)(state, a) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(size)(state, a, 1) == + THCTensor_(size)(state, a, 2), 3, "matrices must be square"); + + if (ra_ != a) { + THCTensor_(resizeAs)(state, ra_, a); + // not sure if this is kosher, but things are nicer if we return in column major + if (ra_->stride[0] == 1) { + THCTensor_(transpose)(state, ra_, NULL, 1, 0); + } else if (ra_->stride[2] == 1) { + THCTensor_(transpose)(state, ra_, NULL, 1, 2); + } + THCTensor_(copy)(state, ra_, a); + } + + + int n = a->size[1]; + int lda; + THCTensor *ra__; + + if (ra_->stride[1] == 1) { + // column ordered, what BLAS wants + lda = ra_->stride[2]; + ra__ = ra_; + } else { + // not column ordered, need to make it such (requires copy) + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, ra_, 1, 2); + ra__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, ra__, NULL, 1, 2); + lda = ra__->stride[2]; + } + + int64_t num_batches = ra__->size[0]; + + if (!pivot) { + THCudaIntTensor *t = THCudaIntTensor_new(state); + THCudaIntTensor_range(state, t, 1, n, 1); + THCudaIntTensor_unsqueeze1d(state, t, t, 0); + THCudaIntTensor** ptrs = (THCudaIntTensor**) THAlloc(sizeof(THCudaIntTensor*)*num_batches); + for (int64_t i=0; i(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, ra__), + ra__->stride[0], num_batches); + + int *pivots_gpu = NULL; + if (pivot) { + pivots_gpu = THCudaIntTensor_data(state, rpivots_); + } +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgetrf(state, n, d_result, lda, pivots_gpu, 
info_gpu, num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches); +#endif + + THCudaFree(state, d_result); + + if (ra__ != ra_) { + THCTensor_(freeCopyTo)(state, ra__, ra_); + } + + if (free_rinfo_) { + int min = THCudaIntTensor_minall(state, rinfo_); + int max = THCudaIntTensor_maxall(state, rinfo_); + THCudaIntTensor_free(state, rinfo_); + if (min != 0 || max != 0) { + THError("failed to factorize some batch elements (min info == %d, max info == %d)", + min, max); + } + } + +#else + THError("btrifact for CUDA tensors is only supported for floats and doubles"); +#endif +} + + +THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); + THArgCheck(THCTensor_(_nDimension)(state, atf) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, b) == 3 || + THCTensor_(_nDimension)(state, b) == 2, 4, "expected 2D or 3D tensor"); + THArgCheck(THCTensor_(size)(state, atf, 0) == + THCTensor_(size)(state, b, 0), 3, "number of batches must be equal"); + THArgCheck(THCTensor_(size)(state, atf, 1) == + THCTensor_(size)(state, atf, 2), 3, "A matrices must be square"); + THArgCheck(THCTensor_(size)(state, atf, 1) == + THCTensor_(size)(state, b, 1), 3, "dimensions of A and b must be equal"); + + if (rb_ != b) { + THCTensor_(resizeAs)(state, rb_, b); + THCTensor_(copy)(state, rb_, b); + } + + + int n = atf->size[1]; + int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + THCTensor *atf_; + THCTensor *rb__; + int lda, ldb; + + // correct ordering of A_tf + if (atf->stride[1] == 1) { + // column ordered, what BLAS wants + lda = atf->stride[2]; + atf_ = atf; + } else { + // not column ordered, need to make it such (requires copy) + // it would be nice if we could use the op(A) flags to automatically + // transpose A if needed, but this leads to unpredictable behavior if the + // user clones A_tf later with a different ordering + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, atf, 1, 2); + atf_ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, atf_, NULL, 1, 2); + lda = atf_->stride[2]; + } + + // correct ordering of B + if (rb_->stride[1] == 1) { + // column ordered + if (rb_->_dim() == 2 || rb_->size[2] == 1) { + ldb = n; + } else { + ldb = rb_->stride[2]; + } + rb__ = rb_; + } else { + // make column ordered + if (rb_->_dim() > 2) { + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, rb_, 1, 2); + rb__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, rb__, NULL, 1, 2); + ldb = rb__->stride[2]; + } else { + rb__ = THCTensor_(newClone)(state, rb_); + ldb = n; + } + } + + int64_t num_batches = rb_->size[0]; + size_t matrices_size = num_batches * sizeof(real*); + + // Copy pointers to device. 
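+  // d_result and d_atf each hold one device pointer per batch element; createBatchGemmBuffer fills them below.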
+ auto d_result = static_cast(THCudaMalloc(state, matrices_size)); + auto d_atf = static_cast(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, rb__), + rb__->stride[0], num_batches); + createBatchGemmBuffer<<>>( + d_atf, THCTensor_(data)(state, atf_), + atf_->stride[0], num_batches); + + if (!THCudaIntTensor_isContiguous(state, pivots)) { + THError("Error: pivots is not contiguous."); + } + + int *pivots_data = THCudaIntTensor_data(state, pivots); + int info; + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches); +#endif + + if (info < 0) { + THError("Illegal arg %d", -info); + } + + THCudaFree(state, d_result); + THCudaFree(state, d_atf); + + if (atf_ != atf) { + THCTensor_(free)(state, atf_); + } + + if (rb__ != rb_) { + THCTensor_(freeCopyTo)(state, rb__, rb_); + } + +#else + THError("btrisolve for CUDA tensors is only supported for floats and doubles"); +#endif +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathBlas.h b/aten/src/THC/generic/THCTensorMathBlas.h new file mode 100644 index 0000000..1279d7e --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathBlas.h @@ -0,0 +1,16 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.h" +#else + +THC_API accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(addmv)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec); +THC_API void THCTensor_(addmm)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat1, THCTensor *mat2); +THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2); +THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); +THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); + +THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a); +THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *atf, THCudaIntTensor *pivots); + + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu new file mode 100644 index 0000000..fca7046 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -0,0 +1,101 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp(value)); +} + +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp(value)); +} + +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) 
+{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp(value)); +} + +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp(value)); +} + +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp(value)); +} + +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp(value)); +} + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp(value)); +} + +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp(value)); +} + +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp(value)); +} + +THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp(value)); +} + +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp(value)); +} + +THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp(value)); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompare.h b/aten/src/THC/generic/THCTensorMathCompare.h new file mode 100644 index 0000000..7b8837c --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompare.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void 
THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); + + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu new file mode 100644 index 0000000..ee7bc41 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -0,0 +1,113 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" +#else + +THC_API void +THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLTOp()); +} + +THC_API void +THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGTOp()); +} + +THC_API void +THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLEOp()); +} + +THC_API void +THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGEOp()); +} + +THC_API void +THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorEQOp()); +} + +THC_API void +THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorNEOp()); +} + +THC_API void +THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLTOp()); +} + +THC_API void +THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGTOp()); +} + +THC_API void +THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLEOp()); +} + +THC_API void +THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGEOp()); +} + +THC_API void +THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorEQOp()); +} + +THC_API void +THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, 
self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorNEOp()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompareT.h b/aten/src/THC/generic/THCTensorMathCompareT.h new file mode 100644 index 0000000..0d76835 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompareT.h @@ -0,0 +1,19 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.h" +#else + +THC_API void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu new file mode 100644 index 0000000..fa72207 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -0,0 +1,737 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.cu" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#ifdef USE_MAGMA + +static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int k) +{ + int64_t size[1] = { k }; + int64_t stride[1] = { 1 }; + THCTensor_(resizeNd)(state, self, 1, size, stride); + size_t len = k * sizeof(real); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n) +{ + int64_t size[2] = { m, n }; + int64_t stride[2] = { 1, m }; + THCTensor_(resizeNd)(state, self, 2, size, stride); + size_t len = m * n * sizeof(real); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) +{ + THAssert(self->_dim() == 2); + size_t len = THCTensor_(nElement)(state, self)*sizeof(real); + THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); + THCTensor *selfc = THCTensor_(newContiguous)(state, temp); + THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, selfc->storage) + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); + THCTensor_(free)(state, temp); + THCTensor_(free)(state, selfc); +} + +#endif // USE_MAGMA + +static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) +{ + THAssert(src->_dim() == 2); + if 
(self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) + { + THCTensor_(retain)(state, self); + return self; + } + + if (self == src) + self = THCTensor_(new)(state); + else + THCTensor_(retain)(state, self); + + int64_t size[2] = { src->size[0], src->size[1] }; + int64_t stride[2] = { 1, src->size[0] }; + + THCTensor_(resizeNd)(state, self, 2, size, stride); + THCTensor_(copy)(state, self, src); + return self; +} + + +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); + THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + + int64_t n = a_->size[0]; + int64_t nrhs = b_->size[1]; + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int *ipiv = th_magma_malloc_pinned(n); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#else + magma_dgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#endif + + if (info < 0) + THError("MAGMA gesv : Argument %d : illegal value", -info); + else if (info > 0) + THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info); + + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gesv)); +#endif +} + +THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); + THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + + magma_side_t sz = MagmaLeft; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + magma_trans_t ts = trans[0] == 'N' ? MagmaNoTrans : MagmaTrans; + magma_diag_t dg = diag[0] == 'U' ? 
MagmaUnit : MagmaNonUnit; + + real alpha = 1; + + int64_t n = a_->size[0]; + int64_t nrhs = b_->size[1]; + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + +#if defined(THC_REAL_IS_FLOAT) + magma_strsm(sz, ul, ts, dg, n, nrhs, alpha, a_data, n, b_data, n); +#else + magma_dtrsm(sz, ul, ts, dg, n, nrhs, alpha, a_data, n, b_data, n); +#endif + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(trtrs)); +#endif +} + +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == b_->size[0], 2, "Expected A and b to have same size " + "at dim 0, but they have incompatible sizes"); + THArgCheck(a_->size[0] >= a_->size[1], 2, "Expected A with shape (m x n) to have " + "m >= n. The case for m < n is not implemented yet."); + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t nrhs = b->size[1]; + real wkopt; + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#endif + + real *hwork = th_magma_malloc_pinned((size_t)wkopt); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#endif + + magma_free_pinned(hwork); + + if (info != 0) + THError("MAGMA gels : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gels)); +#endif +} + +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +{ +#ifdef USE_MAGMA + int64_t n = a->size[0]; + int64_t lda = n; + + magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; + magma_vec_t jobz = jobzs[0] == 'N' ? 
MagmaNoVec : MagmaVec; + + THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a); + real *input_data = THCTensor_(data)(state, input); + + // eigen values and workspace + real *w = th_magma_malloc_pinned(n); + real *wA = th_magma_malloc_pinned(lda * n); + + // compute optimal size of work array + int info; + real lwork; + int liwork; + +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#endif + + real *work = th_magma_malloc_pinned((size_t)lwork); + int *iwork = th_magma_malloc_pinned(liwork); + + // compute eigenvalues and, optionally, eigenvectors +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#endif + + // copy eigen values from w to re_ + if (info == 0) + THCTensor_(copyArray1d)(state, re_, w, n); + + magma_free_pinned(iwork); + magma_free_pinned(work); + magma_free_pinned(wA); + magma_free_pinned(w); + + // check error value + if (info > 0) + THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA syev : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, input, rv_); +#else + THError(NoMagma(syev)); +#endif +} + +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); + + magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; + int64_t n = a_->size[0]; + + real *a_data = th_magma_malloc_pinned(n * n); + THCTensor_(copyTensor2d)(state, a_data, a_); + + real *wr = th_magma_malloc_pinned(n); + real *wi = th_magma_malloc_pinned(n); + + real *vr_data = NULL; + int64_t ldvr = 1; + if (jobvr == MagmaVec) + { + vr_data = th_magma_malloc_pinned(n * n); + ldvr = n; + } + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned(lwork); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA geev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA geev : Argument %d : illegal value", -info); + + { + THCTensor_(resize2d)(state, re_, 2, n); + THCTensor *re = THCTensor_(newContiguous)(state, re_); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + THCTensor_(freeCopyTo)(state, re, re_); + THCTensor_(transpose)(state, re_, NULL, 0, 1); + } + + if (jobvr == MagmaVec) + THCTensor_(copyArray2d)(state, rv_, vr_data, n, n); + + magma_free_pinned(work_data); + magma_free_pinned(vr_data); + magma_free_pinned(wi); + magma_free_pinned(wr); + magma_free_pinned(a_data); + +#else + THError(NoMagma(geev)); +#endif +} + +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +{ +#ifdef USE_MAGMA + THCTensor *ra_ = THCTensor_(new)(state); + THCTensor_(gesvd2)(state, ru_, rs_, rv_, ra_, a, jobu); + THCTensor_(free)(state, ra_); +#else + THError(NoMagma(gesvd)); +#endif +} + +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; + + int iunused[1]; + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = m < n ? m : n; + int64_t j = (jobz == MagmaAllVec) ? m : k; + int64_t jv = (jobz == MagmaAllVec) ? 
n : k; + + real *a_data = th_magma_malloc_pinned(m * n); + THCTensor_(copyTensor2d)(state, a_data, a); + + real *rs_data = th_magma_malloc_pinned(k); + real *ru_data = th_magma_malloc_pinned(m * j); + real *rv_data = th_magma_malloc_pinned(n * n); + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info); +#else + magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned(lwork); + int *iwork = th_magma_malloc_pinned(8 * k); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info); +#else + magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info); +#endif + + if (info > 0) + THError("MAGMA gesdd : the updating process of SBDSDC did not converge (error: %d)", info); + else if (info < 0) + THError("MAGMA gesdd : Argument %d : illegal value", -info); + + THCTensor_(copyArray2d)(state, rv_, rv_data, n, n); + THCTensor_(transpose)(state, rv_, NULL, 0, 1); + if (jobz != MagmaAllVec) + THCTensor_(narrow)(state, rv_, rv_, 1, 0, jv); + THCTensor_(copyArray2d)(state, ru_, ru_data, m, j); + THCTensor_(copyArray1d)(state, rs_, rs_data, k); + THCTensor_(copyArray2d)(state, ra_, a_data, m, n); + + magma_free_pinned(work_data); + magma_free_pinned(iwork); + magma_free_pinned(rv_data); + magma_free_pinned(ru_data); + magma_free_pinned(rs_data); + magma_free_pinned(a_data); +#else + THError(NoMagma(gesvd2)); +#endif +} + +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +{ + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + +#ifdef USE_MAGMA + int info; + int64_t n = a->size[0]; + int lwork = n * magma_get_sgetri_nb(n); + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int *ipiv = th_magma_malloc_pinned(n); + + THCTensor *work = THCTensor_(newWithSize1d)(state, lwork); + real *work_data = THCTensor_(data)(state, work); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); +#else + magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info); +#endif + + if (info > 0) + THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#else + magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getri : Argument %d : illegal value", -info); + + THCTensor_(free)(state, work); + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, input, ra_); +#else + int64_t n = a->size[0]; + + // input + THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); + THCTensor_(resizeNd)(state, ra_, 2, input->size, input->stride); + + real *matrices1[1] = { THCTensor_(data)(state, input) }; + real *matrices2[1] = { THCTensor_(data)(state, ra_) }; + + // Copy pointers to device. 
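+  // THCudaBlas_*getrf/getri below use the batched cuBLAS interface, which takes an array of + // matrix pointers that must itself live in device memory; the single matrix pointer is + // therefore staged through a one-element host array and copied over, with a batch count of 1.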
+  auto d_matrices1 = static_cast<real**>(THCudaMalloc(state, sizeof(real*))); + auto d_matrices2 = static_cast<real**>(THCudaMalloc(state, sizeof(real*))); + + THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, sizeof(real*), + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, sizeof(real*), + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + int info; + auto info_gpu = static_cast<int*>(THCudaMalloc(state, sizeof(int))); + + auto ipiv_gpu = static_cast<int*>(THCudaMalloc(state, n * sizeof(int))); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#else + THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#endif + + THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); + + if (info > 0) + THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetri(state, n, (const real**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#else + THCudaBlas_Dgetri(state, n, (const real**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#endif + + THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); + + if (info > 0) + THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getri : Argument %d : illegal value", -info); + + THCudaFree(state, ipiv_gpu); + THCudaFree(state, info_gpu); + + THCudaFree(state, d_matrices1); + THCudaFree(state, d_matrices2); + + THCTensor_(free)(state, input); +#endif +} + +__global__ void THCTensor_(copyUpperSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r > c) { + input[idx] = input[r*n + c]; + } + } +} + +__global__ void THCTensor_(copyLowerSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r < c) { + input[idx] = input[r*n + c]; + } + } +} + +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? 
MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotri_gpu(ul, n, input_data, n, &info); +#else + magma_dpotri_gpu(ul, n, input_data, n, &info); +#endif + + if (info > 0) + THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potri : Argument %d : illegal value", -info); + + cudaStream_t stream = THCState_getCurrentStream(state); + const int len = n*n; + dim3 blocks(std::min(DIVUP(len, 128), 65535)); + dim3 threads(128); + if (uplo[0] == 'U') { + THCTensor_(copyUpperSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } else { + THCTensor_(copyLowerSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } + + THCTensor_(freeCopyTo)(state, input, ra_); +#else + THError(NoMagma(potri)); +#endif +} + +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrf_gpu(ul, n, input_data, n, &info); +#else + magma_dpotrf_gpu(ul, n, input_data, n, &info); +#endif + + // check error value + if (info > 0) + THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potrf : Argument %d : illegal value", -info); + + if (uplo[0] == 'U') { + THCTensor_(triu)(state, ra_, input, 0); + } else { + THCTensor_(tril)(state, ra_, input, 0); + } + THCTensor_(free)(state, input); +#else + THError(NoMagma(potrf)); +#endif +} + +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + int64_t nrhs = b->size[1]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b); + real *b_data = THCTensor_(data)(state, b_); + THCTensor *a_ = THCTensor_(newColumnMajor)(state, a, a); + real *a_data = THCTensor_(data)(state, a_); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#else + magma_dpotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#endif + + // check error value + if (info < 0) + THError("MAGMA potrs : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, b_, rb_); + THCTensor_(free)(state, a_); +#else + THError(NoMagma(potrs)); +#endif +} + +THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = (m < n ? 
m : n); + +#if defined(THC_REAL_IS_FLOAT) + int64_t nb = magma_get_sgeqrf_nb(m, n); +#else + int64_t nb = magma_get_dgeqrf_nb(m, n); +#endif + + real *rtau_data = th_magma_malloc_pinned(k); + real *a_data = THCTensor_(data)(state, a); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf2_gpu(m, n, a_data, m, rtau_data, &info); +#else + magma_dgeqrf2_gpu(m, n, a_data, m, rtau_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf2 : Argument %d : illegal value.", -info); + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(copyArray1d)(state, rtau_, rtau_data, k); + magma_free_pinned(rtau_data); +#else + THError(NoMagma(geqrf)); +#endif +} + +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_); + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = (m < n ? m : n); + +#if defined(THC_REAL_IS_FLOAT) + int64_t nb = magma_get_sgeqrf_nb(m, n); +#else + int64_t nb = magma_get_dgeqrf_nb(m, n); +#endif + + real *a_data = THCTensor_(data)(state, a); + real *tau_data = th_magma_malloc_pinned(k); + THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + magma_roundup(n, 32))*nb); + real *work_data = THCTensor_(data)(state, work); + + int info; + // We need to call two different versions of ?geqrf: + // ?geqrf_gpu allows fast computation of Q via ?orqrf_gpu, but doesn't give + // R properly. Note that the MAGMA documentation for this method is wrong. + // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800 + // ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orqrf_gpu +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf2_gpu(m, n, a_data, m, tau_data, &info); +#else + magma_dgeqrf2_gpu(m, n, a_data, m, tau_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf2 : Argument %d : illegal value.", -info); + + THCTensor_(narrow)(state, a, a, 0, 0, k); + THCTensor_(triu)(state, rr_, a, 0); + THCTensor_(free)(state, a); + + a = THCTensor_(newColumnMajor)(state, rq_, a_); + a_data = THCTensor_(data)(state, a); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#else + magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf : Argument %d : illegal value.", -info); + + THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a); + real *q_data = THCTensor_(data)(state, q); + +#if defined(THC_REAL_IS_FLOAT) + magma_sorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info); +#else + magma_dorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info); +#endif + + if (info != 0) + THError("MAGMA orgqr : Argument %d : illegal value.", -info); + + THCTensor_(free)(state, a); + THCTensor_(free)(state, work); + magma_free_pinned(tau_data); + + THCTensor_(narrow)(state, q, q, 1, 0, k); + THCTensor_(freeCopyTo)(state, q, rq_); +#else + THError(NoMagma(qr)); +#endif +} + +#endif + +#endif diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h new file mode 100644 index 0000000..1462af4 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathMagma.h @@ -0,0 +1,25 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +// MAGMA (i.e. 
CUDA implementation of LAPACK functions) +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag); +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo); +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr); +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *a, THCTensor *b, const char *uplo); +THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_); +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a); + +#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu new file mode 100644 index 0000000..e0f1219 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -0,0 +1,340 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" +#else + +THC_API void +THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorSubConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorSubConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, real value, real alpha) +{ +#ifdef THC_REAL_IS_HALF + auto v = THC_half2float(value) * THC_half2float(alpha); + THCTensor_(add)(state, self_, src_, THC_float2half(v)); +#else + THCTensor_(add)(state, self_, src_, value * alpha); +#endif +} + +THC_API void +THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, real value, real alpha) +{ +#ifdef THC_REAL_IS_HALF + auto v = THC_half2float(value) * THC_half2float(alpha); 
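+  // The scalar product value * alpha is computed in float and converted back to half, + // since THC half scalars have no host-side arithmetic of their own here.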
+ THCTensor_(sub)(state, self_, src_, THC_float2half(v)); +#else + THCTensor_(sub)(state, self_, src_, value * alpha); +#endif +} + +THC_API void +THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorDivConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCTensor_(mul)(state, self_, src_, pow(2, value)); +#elif defined(THC_REAL_IS_HALF) + return THError("lshift not supported for torch.CudaHalfTensor"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorLShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorLShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCTensor_(mul)(state, self_, src_, pow(2, -value)); +#elif defined(THC_REAL_IS_HALF) + return THError("rshift not supported for torch.CudaHalfTensor"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorRShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorRShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorFmodOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorFmodOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorRemainderOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + 
THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorRemainderOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(!src_->is_empty() && src_->dim() == 2, 1, "expected a matrix"); + + if (self_ != src_) + THCTensor_(resizeAs)(state, self_, src_); + + int64_t stride0 = self_->stride[0]; + int64_t stride1 = self_->stride[1]; + real *start = THCTensor_(data)(state, self_); + + TensorTriOp op(start, stride0, stride1, k); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(!src_->is_empty() && src_->dim() == 2, 1, "expected a matrix"); + + if (self_ != src_) + THCTensor_(resizeAs)(state, self_, src_); + + int64_t stride0 = self_->stride[0]; + int64_t stride1 = self_->stride[1]; + real *start = THCTensor_(data)(state, self_); + + TensorTriOp op(start, stride0, stride1, k); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + + if (!THC_pointwiseApply2(state, self_, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (!THCTensor_(isSameSizeAs(state, self_, src_))) { + return 0; + } + + // This is not as efficient as TH, but the basic idea: create a buffer that stores + // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value + // in this buffer is 1, the two tensors are equal, otherwise they are not + + THLongStorage *size = THCTensor_(newSizeOf)(state, self_); + THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, size, NULL); + + if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + unsigned char min = THCudaByteTensor_minall(state, buf); + + THLongStorage_free(size); + THCudaByteTensor_free(state, buf); + + return min != 0; +} + +THC_API void +THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitand only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitxor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h new file mode 100644 index 0000000..b54b0c6 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -0,0 +1,21 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h" +#else + +THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self, THCTensor *src, real value, real alpha); +THC_API void THCTensor_(sub_scaled)(THCState *state, THCTensor *self, THCTensor *src, real value, real alpha); +THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(rshift)(THCState *state, 
THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, real value); + +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu new file mode 100644 index 0000000..7fb6fda --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -0,0 +1,765 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu" +#else + +#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \ + struct Tensor_##NAME##_##REAL##_Op { \ + __device__ __forceinline__ void operator()(real* out, real* in) const { \ + *out = CFUNC(*in); \ + } \ + \ + __device__ __forceinline__ void operator()(real* v) const { \ + *v = CFUNC(*v); \ + } \ + }; \ + \ + void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \ + if (self_ == src) { \ + if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \ + THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ + } \ + } else { \ + THCTensor_(resizeAs)(state, self_, src); \ + \ + if (!THC_pointwiseApply2(state, self_, src, Tensor_##NAME##_##REAL##_Op())) { \ + THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ + } \ + } \ + \ + THCudaCheck(cudaGetLastError()); \ + } + +#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC, REAL) \ + IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log, THCNumerics::log, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(lgamma, THCNumerics::lgamma, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log10, THCNumerics::log10, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics::log1p, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log2, THCNumerics::log2, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( exp, THCNumerics::exp, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(expm1, THCNumerics::expm1, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cos, THCNumerics::cos, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sin, THCNumerics::sin, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sqrt, THCNumerics::sqrt, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(rsqrt, THCNumerics::rsqrt, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( ceil, THCNumerics::ceil, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(floor, THCNumerics::floor, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(trunc, THCNumerics::trunc, Real) + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( acos, THCNumerics::acos, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cosh, THCNumerics::cosh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( asin, THCNumerics::asin, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sinh, THCNumerics::sinh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( tan, THCNumerics::tan, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( atan, THCNumerics::atan, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( tanh, THCNumerics::tanh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( erf, THCNumerics::erf, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( erfc, THCNumerics::erfc, Real) 
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(erfinv, THCNumerics::erfinv,Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( round, THCNumerics::round, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( frac, THCNumerics::frac, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cinv, THCNumerics::cinv, Real) + +#endif + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( neg, THCNumerics::neg, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics::abs, Real) + +#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_ +#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC + +void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real min_value, + real max_value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); + + int i; + int nd = THCTensor_(nDimension)(state, x); + ptrdiff_t nelem = THCTensor_(nElement)(state, x); + THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions"); + for (i = 0; i < nd; i++) { + THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i); + if (dimension < 0 && THCTensor_(size)(state, x, i) == 3) { + dimension = i; + } + } + + THArgCheck(dimension >= 0 && dimension < nd, 3, "dimension %d out of range", dimension+1); + THArgCheck(THCTensor_(size)(state, x, dimension) == 3, 3, + "dimension %d does not have size 3", dimension+1); + THCTensor_(resizeAs)(state, self, x); + + int64_t sx = THCTensor_(stride)(state, x, dimension); + int64_t sy = THCTensor_(stride)(state, y, dimension); + int64_t so = THCTensor_(stride)(state, self, dimension); + THCTensor *nx = THCTensor_(newNarrow)(state, x, dimension, 0, 1); + THCTensor *ny = THCTensor_(newNarrow)(state, y, dimension, 0, 1); + THCTensor *nself = THCTensor_(newNarrow)(state, self, dimension, 0, 1); + if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + THCTensor_(free)(state, nx); + THCTensor_(free)(state, ny); + THCTensor_(free)(state, nself); +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +void THCTensor_(atan2)(THCState *state, THCTensor *self_, THCTensor *tx, THCTensor *ty) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, tx, ty)); + THArgCheck(THCTensor_(nElement)(state, tx) == + THCTensor_(nElement)(state, ty), 3, "sizes do not match"); + THCTensor_(resizeAs)(state, self_, tx); + + if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { + THArgCheck(false, 2, 
CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(digamma)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ != src) { + THCTensor_(resizeAs)(state, self_, src); + } + if (!THC_pointwiseApply2(state, self_, src, TensorDigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(polygamma)(THCState* state, THCTensor* self_, int64_t n, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ != src) { + THCTensor_(resizeAs)(state, self_, src); + } + switch (n) { + case 0: + if (!THC_pointwiseApply2(state, self_, src, TensorDigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + break; + case 1: + if (!THC_pointwiseApply2(state, self_, src, TensorTrigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + break; + default: + THError("polygamma(n,x) is not implemented for n>=2"); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); + THArgCheck(THCTensor_(nElement)(state, a) == + THCTensor_(nElement)(state, b), 3, "sizes do not match"); + THCTensor_(resizeAs)(state, result, a); + + if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif + +THC_API void +THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert::to(1)) { + // self += src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += value * src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert::to(1)) { + // self = src1 + src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 + value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert::to(1)) { + // self -= src2 + 
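+      // (When value != 1, the branches below express the subtraction as self += (-value) * src2, + // i.e. TensorCAddOp applied with the negated scalar, so no separate scaled-subtract op is needed.)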
if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += -value * src2 + if (!THC_pointwiseApply2(state, self_, src2, + TensorCAddOp( + ScalarNegate::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert::to(1)) { + // self = src1 - src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 - value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, + TensorCAddOp( + ScalarNegate::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self *= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self = pow(self, src2) + if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = pow(src1, src2) + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real value) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (THCNumerics::eq(value, ScalarConvert::to(1))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(2))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(3))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + } else if (THCNumerics::eq(value, ScalarConvert::to(-1))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(-2))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#endif + } else { + // fallback implementation using pow + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if 
(THCNumerics::eq(value, ScalarConvert::to(1))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(2))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(3))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + } else if (THCNumerics::eq(value, ScalarConvert::to(-1))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(-2))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#endif + } else { + // fallback implementation using pow + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorTPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} +THC_API void +THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) + return THError("clshift not supported for torch.CudaHalfTensor"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorLShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorLShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) + return THError("crshift not supported for torch.CudaHalfTensor"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + 
THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorRShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorRShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorMinOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorCRemainderOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorCRemainderOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorCFmodOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorCFmodOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + + if (self == src) { + if (!THC_pointwiseApply1(state, self, TensorMaxValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src); + if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value) +{ + 
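+  // Elementwise minimum against a scalar: self[i] = min(src[i], value); applied in place + // when self == src, otherwise the result is written into a resized self.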
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + + if (self == src) { + if (!THC_pointwiseApply1(state, self, TensorMinValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src); + if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); + if(self_ != t) + { + THCTensor_(resizeAs)(state, self_, t); + THCTensor_(copy)(state, self_, t); + } + else + { + THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1), + 1, "sizes do not match"); + } + + THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2), + 3, "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); + if(self_ != t) + { + THCTensor_(resizeAs)(state, self_, t); + THCTensor_(copy)(state, self_, t); + } + else + { + THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1), + 1, "sizes do not match"); + } + THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2), + 3, "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitand is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + 
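+  // The pointwise kernels above are launched asynchronously; the
+  // cudaGetLastError() check below catches launch-time failures (e.g. an
+  // invalid launch configuration). Errors raised while a kernel executes
+  // surface later, at a subsequent error check or synchronizing call.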
THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} +#endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h new file mode 100644 index 0000000..7f79027 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -0,0 +1,72 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h" +#else + +THC_API void THCTensor_(pow)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(tpow)(THCState *state, THCTensor *self, real value, THCTensor *src); +THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(lgamma)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(digamma)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(polygamma)(THCState *state, THCTensor *self, int64_t n, THCTensor *src); +THC_API void THCTensor_(log10)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log2)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(expm1)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(cos)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(acos)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(cosh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sin)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(asin)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sinh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(tan)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(atan)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(atan2)(THCState *state, THCTensor *r_, THCTensor *tx, THCTensor *ty); +THC_API void THCTensor_(tanh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erf)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erfc)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erfinv)(THCState *state, THCTensor *self, THCTensor 
*src); +THC_API void THCTensor_(sqrt)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(rsqrt)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(ceil)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(floor)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(round)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(trunc)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(frac)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w); + +THC_API void THCTensor_(cinv)(THCState *state, THCTensor *self, THCTensor *src); + +#endif + +THC_API void THCTensor_(neg)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(abs)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, real min_value, real max_value); +THC_API void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension); + +THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu new file mode 100644 index 0000000..e5d8e22 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -0,0 +1,476 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" +#else + +THC_API void +THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { + 
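+  // THC_reduceDim composes a per-element transform, a binary accumulate op,
+  // and a finalizer applied to the accumulated value, seeded with the
+  // reduction's identity element, roughly:
+  //   out[j] = finalize(reduce_i(transform(src_j[i])))
+  // sum uses (identity, add, identity) seeded with 0; prod below uses
+  // (identity, multiply, identity) seeded with 1, and mean divides by the
+  // size of the reduced dimension in its finalizer.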
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceMultiply{}, + thrust::identity{}, + scalar_cast(1), + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceAdd{}, + ReduceDivide{size}, + scalar_cast(0), + dim, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void +THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, real maxnorm) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + THCTensor *self_; + THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0); + THCTensor *data = THCTensor_(newClone)(state, src_); + ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0]; + + THArgCheck(dimension >= 0 && dimension < THCTensor_(_nDimension)(state, src), 3, "invalid dimension"); + THArgCheck(THCNumerics::gt(value, scalar_cast(0)), 2, "non-positive-norm not supported"); + THArgCheck(THCTensor_(_nDimension)(state, src) > 1, 1, "need at least 2 dimensions"); + + dim3 grid(data->size[0]); + dim3 threads(32); + + THCTensor_kernel_renorm + <<>> + (THCTensor_(data)(state, data), scalar_cast(value), size, scalar_cast(maxnorm)); + + cudaError errcode = cudaGetLastError(); + if(errcode != cudaSuccess) + THError(cudaGetErrorString(errcode)); + + THCTensor_(free)(state, src_); + self_ = THCTensor_(newTranspose)(state, data, dimension, 0); + THCTensor_(resizeAs)(state, self, self_); + THCTensor_(freeCopyTo)(state, self_, self); + THCTensor_(free)(state, data); +} + +THC_API void +THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + + THCTensor_preserveReduceDimSemantics( + state, self_, THCTensor_(_nDimension)(state, src), dimension, keepdim); + THLongStorage *dim = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, self_, dim, NULL); + THLongStorage_free(dim); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (dimension == THCTensor_(_nDimension)(state, src) - 1) { + THCTensor_varInnermostDim(state, self, src, biased); + } else { + THCTensor_varOuterDim(state, self, src, dimension, biased); + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, self_, self_, dimension); + } +} + +THC_API void +THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +{ + 
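+  // var mirrors std above: the output temporarily keeps the reduced dimension
+  // with size 1, the innermost-dimension kernel is used when the reduced
+  // dimension is the last (contiguous) one and the outer-dimension variant
+  // otherwise, and the singleton dimension is squeezed away when keepdim is 0.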
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + + THCTensor_preserveReduceDimSemantics( + state, self_, THCTensor_(_nDimension)(state, src), dimension, keepdim); + THLongStorage *dim = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, self_, dim, NULL); + THLongStorage_free(dim); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (dimension == THCTensor_(_nDimension)(state, src) - 1) { + THCTensor_varInnermostDim(state, self, src, biased); + } else { + THCTensor_varOuterDim(state, self, src, dimension, biased); + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, self_, self_, dimension); + } +} + +THC_API accreal +THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); +} + +THC_API accreal +THCTensor_(varall)(THCState *state, THCTensor *self, int biased) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal mean = THCTensor_(meanall)(state, self); + + accreal val; + if (!THC_reduceAll(state, self, + SquareFunctor(mean), + ReduceAdd(), + scalar_cast(0), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + val = THCNumerics::div( + val, + scalar_cast(std::max(0, THCTensor_(nElement)(state, self) - (biased ? 0 : 1))) + ); + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API void +THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real _value, int dimension, int keepdim) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (THCNumerics::eq(value, scalar_cast(0))) { + THC_reduceDim(state, self, src, + TensorNonZeroOp{}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(1))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(2))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + ReducePow{scalar_cast(.5)}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(INFINITY))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceMax{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + ReducePow{THCNumerics::cinv(value)}, + scalar_cast(0), + dimension, keepdim); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API accreal +THCTensor_(normall)(THCState *state, THCTensor *self, real _value) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal result; + + if (THCNumerics::eq(value, scalar_cast(0))) { + THC_reduceAll(state, self, + TensorNonZeroOp{}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + } else if (THCNumerics::eq(value, scalar_cast(1))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + } else if (THCNumerics::eq(value, scalar_cast(2))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + result = THCNumerics::sqrt(result); + } else if (THCNumerics::eq(value, 
scalar_cast(INFINITY))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceMax{}, + scalar_cast(0), + &result, 0); + } else { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + result = THCNumerics::pow(result, + THCNumerics::cinv(value)); + } + + THCudaCheck(cudaGetLastError()); + return result; +} + +accreal THCTensor_(dist)(THCState *state, THCTensor *self, + THCTensor *src, real _value) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + self = THCTensor_(newContiguous)(state, self); + ptrdiff_t size = THCTensor_(nElement)(state, self); + src = THCTensor_(newContiguous)(state, src); + thrust::device_ptr self_data(THCTensor_(data)(state, self)); + thrust::device_ptr src_data(THCTensor_(data)(state, src)); + + THCThrustAllocator thrustAlloc(state); + accreal result = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + self_data, self_data+size, src_data, scalar_cast(0), + thrust::plus(), + ThrustTensorDistOp(value)); + + THCTensor_(free)(state, src); + THCTensor_(free)(state, self); + + return THCNumerics::pow(result, THCNumerics::cinv(value)); +} + +#endif + +THC_API accreal +THCTensor_(sumall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceAdd{}, + scalar_cast(0), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API accreal +THCTensor_(prodall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMultiply{}, + scalar_cast(1), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API accreal +THCTensor_(meanall)(THCState *state, THCTensor *self) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); +} + +THC_API real +THCTensor_(minall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMin{}, + THCNumerics::max(), &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return scalar_cast(val); +} + +THC_API real +THCTensor_(maxall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMax{}, + THCNumerics::min(), &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return scalar_cast(val); +} + +THC_API real +THCTensor_(medianall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + + real val; + ptrdiff_t nelem, k; + + nelem = THCTensor_(nElement)(state, self); + k = (nelem-1) >> 1; + + THLongStorage *size = THLongStorage_newWithSize1(nelem); + THCTensor *view = THCTensor_(newView)(state, self, size); + + THLongStorage_free(size); + + THCTensor *sorted = THCTensor_(new)(state); + THCudaLongTensor *indices = THCudaLongTensor_new(state); + + THCTensor_(sort)(state, sorted, indices, view, 0, 0); + + val = THCTensor_(get1d)(state, sorted, k); + + 
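+  // val is the lower median: with k = (nelem - 1) >> 1, an even count such as
+  // nelem = 6 reads sorted index 2 (the third-smallest element), so the two
+  // middle values are never averaged.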
THCTensor_(free)(state, view); + THCTensor_(free)(state, sorted); + THCudaLongTensor_free(state, indices); + + THCudaCheck(cudaGetLastError()); + + return val; +} + +THC_API void +THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + + int64_t t_size_dim, k; + + t_size_dim = THCTensor_(size)(state, self, dimension); + + k = (t_size_dim-1) >> 1; + + THCTensor *sorted = THCTensor_(new)(state); + THCudaLongTensor *sorted_indices = THCudaLongTensor_new(state); + + THCTensor_(sort)(state, sorted, sorted_indices, self, dimension, 0); + + THCTensor *newValues = THCTensor_(newNarrow)(state, sorted, dimension, k, 1); + THCudaLongTensor *newIndices = THCudaLongTensor_newNarrow(state, sorted_indices, dimension, k, 1); + + THCTensor_(free)(state, sorted); + THCudaLongTensor_free(state, sorted_indices); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, newValues, newValues, dimension); + THCudaLongTensor_squeeze1d(state, newIndices, newIndices, dimension); + } + + THCTensor_(resizeAs)(state, values, newValues); + THCudaLongTensor_resizeAs(state, indices, newIndices); + THCTensor_(copy)(state, values, newValues); + THCudaLongTensor_copy(state, indices, newIndices); + + THCTensor_(free)(state, newValues); + THCudaLongTensor_free(state, newIndices); + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); + + thrust::pair + init = + thrust::make_pair( + THCNumerics::min(), 0); + + return THC_reduceDimIndex( + state, values, indices, src, dimension, keepdim, init, + MaxValuePair()); +} + +THC_API void +THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); + + thrust::pair + init = + thrust::make_pair( + THCNumerics::max(), 0); + + return THC_reduceDimIndex( + state, values, indices, src, dimension, keepdim, init, + MinValuePair()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.h b/aten/src/THC/generic/THCTensorMathReduce.h new file mode 100644 index 0000000..4fbbc94 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathReduce.h @@ -0,0 +1,47 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathReduce.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, real max_norm); +THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, int dim, int biased, int keepdim); +THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, int keepdim); +THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, int dim, int biased, int keepdim); + +THC_API accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased); +THC_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, real value); +THC_API accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased); + +#endif + +THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); +THC_API void 
THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); +THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); + +THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self); +THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self); +THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); +THC_API void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); + +THC_API real THCTensor_(minall)(THCState *state, THCTensor *self); +THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self); +THC_API real THCTensor_(medianall)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); + +THC_API accreal THCTensor_(dist)(THCState *state, THCTensor *self, THCTensor *src, + real value); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathScan.cu b/aten/src/THC/generic/THCTensorMathScan.cu new file mode 100644 index 0000000..5aafb3b --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathScan.cu @@ -0,0 +1,122 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathScan.cu" +#else + +#ifndef THC_REAL_IS_HALF +template +__host__ void THCTensor_(scanThrust)( + THCState *state, + THCTensor *dst, + THCTensor *src, + BinaryFunction binary_op) +{ + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr src_data(THCTensor_(data)(state, src)); + thrust::device_ptr dst_data(THCTensor_(data)(state, dst)); + ptrdiff_t size = THCTensor_(nElement)(state, src); + thrust::inclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + src_data, src_data + size, dst_data, + binary_op); +} +#endif + +template +__host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt, + THCTensor *src, int dimension, + real init, BinaryOp binary_op) +{ + unsigned ndim = THCTensor_(_nDimension)(state, src); + // Treat all outer dimensions (i.e. dim < dimension) as one. + unsigned num_orows = 1; + for (int dim = 0; dim < dimension; dim++) { + num_orows *= THCTensor_(size)(state, src, dim); + } + unsigned row_size = THCTensor_(size)(state, src, dimension); + // Treat all inner dimensions (i.e. dim > dimension) as one. + unsigned num_irows = 1; + for (unsigned dim = dimension + 1; dim < ndim; dim++) { + num_irows *= THCTensor_(size)(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + THCTensor_kernel_scanOuterDim<<>>( + THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), + num_orows, num_irows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +__host__ void THCTensor_(scanInnermostDim)(THCState *state, THCTensor *tgt, + THCTensor *src, real init, + BinaryFunction binary_op) +{ + unsigned ndim = THCTensor_(_nDimension)(state, src); + // Treat all outer dimensions as a single dimension. 
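+  // All leading dimensions collapse into num_rows independent rows of length
+  // row_size (the innermost dimension). Judging by the launch configuration
+  // below, each block scans threads.y rows, with threads.x lanes cooperating
+  // along each row.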
+ unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_(size)(state, src, dim); + } + unsigned row_size = THCTensor_(size)(state, src, ndim - 1); + + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + THCTensor_kernel_scanInnermostDim<<>>( + THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), num_rows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, + int dimension, real init, BinaryFunction binary_op) +{ + // "init" must be the identity element for binary_op + int ndim = THCTensor_(nDimension)(state, src); + THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THCTensor_(resizeAs)(state, self_, src); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (!self->is_empty()) { + #ifndef THC_REAL_IS_HALF + if (ndim == 1) { + // thrust does not take an "init" + THCTensor_(scanThrust)(state, self, src, binary_op); + } else + #endif + if (dimension == ndim - 1) { + THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); + } else { + THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + } + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); +} + +void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + return THCTensor_(scanDim)(state, self, src, dimension, + ScalarConvert::to(0.0), AddOp()); +} + +void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + return THCTensor_(scanDim)(state, self, src, dimension, + ScalarConvert::to(1.0), MulOp()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathScan.h b/aten/src/THC/generic/THCTensorMathScan.h new file mode 100644 index 0000000..435519a --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathScan.h @@ -0,0 +1,8 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathScan.h" +#else + +THC_API void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, int dim); +THC_API void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, int dim); + +#endif diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu new file mode 100644 index 0000000..d54d171 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -0,0 +1,323 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMode.cu" +#else + +THC_API void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { + THAssert(THCTensor_(isContiguous)(state, input)); + + // Because the input is contiguous, we want to get a reference to the + // location of the buffer at the innermost dimension that we are going + // to calculate the mode for --> we do this by manually doing the stride + // calculations to get an offset + real *data = THCTensor_(data)(state, input); + for (int i = 0; i < THLongStorage_size(position); ++i) { + data += THLongStorage_data(position)[i] * THCTensor_(stride)(state, input, i); + } + + int64_t nElement = THCTensor_(size)(state, input, THCTensor_(_nDimension)(state, input) - 
1); + THCThrustAllocator thrustAlloc(state); + + // Wrap input data, sortBuffer, in Thrust device vectors + thrust::device_ptr vecPtr = thrust::device_pointer_cast(data); + thrust::device_vector iter(vecPtr, vecPtr + nElement); + thrust::device_ptr sbPtr = thrust::device_pointer_cast(THCudaLongStorage_data(state, sortBuffer)); + thrust::device_vector seq(sbPtr, sbPtr + nElement); + + // Fill sortBuffer with [0, 1, 2, ... nElement - 1] + thrust::sequence( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + seq.begin(), seq.end()); + + // Sort the input data. The original indices of the data are stored in seq + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), seq.begin() +#if defined(THC_REAL_IS_HALF) + , ThrustHalfLess() +#endif + ); + + // Count # of unique elements via an inner product between adjacent elements. + // Add 1 if two neighboring element are not equal. + int unique = 1 + thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end() - 1, iter.begin() + 1, 0, thrust::plus(), +#if defined(THC_REAL_IS_HALF) + ThrustHalfNotEqualTo() +#else + thrust::not_equal_to() +#endif + ); + + // Count frequency of each element + thrust::device_vector keys(unique); + thrust::device_vector counts(unique); + thrust::reduce_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), + thrust::constant_iterator(1), keys.begin(), counts.begin() +#if defined(THC_REAL_IS_HALF) + , ThrustHalfEqualTo() +#endif + ); + + // Find index of maximum count + thrust::device_vector::iterator it = thrust::max_element( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + counts.begin(), counts.end()); + real mode = keys[it - counts.begin()]; + + // Find first index within which it occurs +#if defined(THC_REAL_IS_HALF) + thrust::device_vector::iterator positionIter = thrust::find_if( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), ThrustHalfEqualToPredicate(mode)); +#else + thrust::device_vector::iterator positionIter = thrust::find( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), mode); +#endif + + THAssert(positionIter != iter.end()); + int64_t index = TH_INDEX_BASE + seq[positionIter - iter.begin()]; + + // Place mode, index in output + ptrdiff_t valuesOffset = THCTensor_(storageOffset)(state, values); + int64_t indicesOffset = THCudaLongTensor_storageOffset(state, indices); + + for (int i = 0; i < THLongStorage_size(position); ++i) { + int64_t pos = THLongStorage_data(position)[i]; + valuesOffset += THCTensor_(stride)(state, values, i) * pos; + indicesOffset += THCudaLongTensor_stride(state, indices, i) * pos; + } + THCStorage_(set)(state, THCTensor_(storage)(state, values), valuesOffset, mode); + THCudaLongStorage_set(state, THCudaLongTensor_storage(state, indices), indicesOffset, index); +} + +// this probably could be a loop, not a recursive algorithm +THC_API void 
THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position, + int curDim) { + int64_t ndim = THCTensor_(_nDimension)(state, input); + + // Because we have transposed the Tensor, the data for the dimension we are mode'ing along + // is always in the innermost dimension + if (curDim == ndim - 1) { + THCTensor_(calculateMode)(state, values, indices, input, sortBuffer, dimension, position); + } else { + // Loop through the values and recurse + for (int i = 0; i < THCTensor_(size)(state, input, curDim); ++i) { + THLongStorage_data(position)[curDim] = i; + THCTensor_(dimApplyMode)(state, values, indices, input, sortBuffer, dimension, position, curDim + 1); + } + } +} + +#define MAX_GRID_SIZE 65535 +#define MAX_BLOCK_SIZE 1024 + +THC_API void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { + THLongStorage *dim; + THCTensor *transposed, *contiguous, *valuesTransposed; + THLongStorage *position; + THCudaLongStorage *sortBuffer; + THCudaLongTensor *indicesTransposed; + int64_t ndim, sliceSize, slices; + + + THAssert(THCTensor_(checkGPU)(state, 1, values)); + + // Verify they are asking for a valid dimension + ndim = THCTensor_(_nDimension)(state, input); + THArgCheck(dimension >= 0 && dimension < ndim, 4, "Dimension of out bounds"); + + sliceSize = THCTensor_(size)(state, input, dimension); + slices = THCTensor_(nElement)(state, input) / sliceSize; + + // Resize output value, index Tensors to appropriate sizes (i.e. the same as + // the input Tensor, except at dim=dimension, the size is 1) + THCTensor_preserveReduceDimSemantics( + state, values, ndim, dimension, keepdim); + THCTensor_preserveReduceDimSemantics( + state, indices, ndim, dimension, keepdim); + dim = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, values, dim, NULL); + THCudaLongTensor_resize(state, indices, dim, NULL); + THLongStorage_free(dim); + + // If sliceSize is 1, copy input to values and set indices + if (sliceSize == 1) { + THCTensor_(copy)(state, values, input); + THCudaLongTensor_fill(state, indices, TH_INDEX_BASE); + if (!keepdim) { + THCTensor_(squeeze1d)(state, values, values, dimension); + THCudaLongTensor_squeeze1d(state, indices, indices, dimension); + } + return; + } + + // Requirements for fused kernel implementation: + // + // 1. sliceSize <= 2 * max threads per block + // 2. uses one block per slice, so number of slices must be less than the maximum number of blocks for + // a kernel launch + // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed) + if (sliceSize <= MAX_BLOCK_SIZE && + slices <= MAX_GRID_SIZE && + THCTensor_canUse32BitIndexMath(state, input)) { + // Beginning our optimized implementation. 
First thing we want to do is to transpose + // the input Tensor along the sort dimension, and then make it contiguous + transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1); + contiguous = THCTensor_(newContiguous)(state, transposed); + + // We also need to view the values and indices Tensors as transposed in order to + // properly determine the offset into the underlying storage in which to place the + // mode and index for a particular set of dimension values + valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim-1); + indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim-1); + + // Set-up TensorInfo structs for passing to kernel + TensorInfo tiValues = getTensorInfo(state, valuesTransposed); + TensorInfo tiIndices = getTensorInfo(state, indicesTransposed); + + // The number of blocks is the number of slices that we need to calculate the mode for. Each block + // is responsible for computing a single mode + dim3 grid; + THC_getGridFromTiles(slices, grid); + + // The blocksize is two elements per thread, rounded up to the nearest power of 2 + int64_t ceilPowerOf2 = nextHighestPowerOf2(sliceSize); + + // Macro that calls kernel --> note that we set the block dimensions here, and + // the amount of shared memory + #define HANDLE_MODE(SIZE) \ + { \ + dim3 blockSize(SIZE / 2); \ +\ + int memsize = (sizeof(real) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \ + computeMode \ + <<>>( \ + THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \ + } + + // Tradeoff between compilation time and the number of specializations. Ideally we would have + // one HANDLE_MODE for each power of 2 + switch(ceilPowerOf2) { + case 2048: + HANDLE_MODE(2048) + break; + case 1024: + case 512: + case 256: + HANDLE_MODE(1024) + break; + case 128: + case 64: + HANDLE_MODE(128) + break; + case 32: + case 16: + case 8: + case 4: + case 2: + HANDLE_MODE(32) + break; + case 1: + default: + assert(false); + } + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, transposed); + THCTensor_(free)(state, contiguous); + THCTensor_(free)(state, valuesTransposed); + THCudaLongTensor_free(state, indicesTransposed); + } else { + // Beginning our naive implementation: We don't want to mutate the input Tensor, but + // we need to be able to sort the inputs along the dimension in order to calculate the + // mode. Additionally, its ideal if the data along the dimension is contiguous. So + // we transpose the dimension with the innermost dimension and make a new contiguous + // version that we can use. 
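+  // The slow path reuses the Thrust-based calculateMode above: dimApplyMode
+  // visits every combination of outer indices and, for each innermost slice,
+  // sorts the slice, counts equal-value runs with reduce_by_key, takes the
+  // key with the largest count, and records the first position at which that
+  // value appears in the original (unsorted) order via the sorted index buffer.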
+ transposed = THCTensor_(newClone)(state, input); + THCTensor_(transpose)(state, transposed, NULL, dimension, ndim - 1); + contiguous = THCTensor_(newContiguous)(state, transposed); + THCTensor_(free)(state, transposed); + + // We also need to view the values and indices Tensors as transposed in order to + // properly determine the offset into the underlying storage in which to place the + // mode and index for a particular set of dimension values + valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim - 1); + indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim - 1); + + // Position is a Storage that will store the dimension values we are processing + position = THLongStorage_newWithSize(ndim - 1); + + // Sort Buffer is a Storage that will be used in the internal sort required to calculate + // the mode efficiently + sortBuffer = THCudaLongStorage_newWithSize(state, sliceSize); + + // Call mode + THCTensor_(dimApplyMode)(state, valuesTransposed, indicesTransposed, contiguous, sortBuffer, dimension, position, 0); + + THCTensor_(free)(state, contiguous); + THLongStorage_free(position); + THCTensor_(free)(state, valuesTransposed); + THCudaLongTensor_free(state, indicesTransposed); + THCudaLongStorage_free(state, sortBuffer); + } + + if (!keepdim) { + THCTensor_(squeeze1d)(state, values, values, dimension); + THCudaLongTensor_squeeze1d(state, indices, indices, dimension); + } +} + +#undef MAX_GRID_SIZE +#undef MAX_BLOCK_SIZE + +#endif diff --git a/aten/src/THC/generic/THCTensorMode.h b/aten/src/THC/generic/THCTensorMode.h new file mode 100644 index 0000000..6f24380 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMode.h @@ -0,0 +1,14 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMode.h" +#else + +/* Returns the mode, and index of the mode, for the set of values + * along a given dimension in the input tensor. 
*/ +THC_API void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim); + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu new file mode 100644 index 0000000..353fadc --- /dev/null +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -0,0 +1,545 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorRandom.cu" +#else + +#define NUM_BLOCKS min((int)THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS) + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_uniform<<>>( + gen->state.gen_states, size, data, a, b); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_normal<<>>( + gen->state.gen_states, size, data, mean, stdv); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { + THCTensor_(resizeAs)(state, self, means); + THCTensor_(normal)(state, self, 0, stddev); + THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); +} + +THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +{ + THCTensor_(resizeAs)(state, self, stddevs); + THCTensor_(normal)(state, self, 0, 1); + THCTensor_(cmul)(state, self, self, stddevs); + THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); +} + +THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +{ + THCTensor_(resizeAs)(state, self, means); + THCTensor_(normal)(state, self, 0, 1); + THCTensor_(cmul)(state, self, self, stddevs); + THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); +} + +THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +{ + + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generateLogNormal<<>>( + gen->state.gen_states, size, data, mean, stdv); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + 
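+  // generate_exponential (defined alongside the other GENERATE_KERNEL
+  // helpers) presumably uses inverse-transform sampling: for U drawn
+  // uniformly from (0, 1],  -log(U) / lambda  follows Exponential(lambda).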
+ generate_exponential<<>>( + gen->state.gen_states, size, data, lambda); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_cauchy<<>>( + gen->state.gen_states, size, data, median, sigma); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +void THCTensor_(renormRows)(struct THCState* state, + THCTensor* t) { + THAssert(THCTensor_(_nDimension)(state, t) == 2); + int64_t rows = THCTensor_(size)(state, t, 0); + int64_t cols = THCTensor_(size)(state, t, 1); + + cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state); + THAssert(props != NULL); + + int numSM = props->multiProcessorCount; + int maxThreads = props->maxThreadsPerBlock; + + dim3 grid(rows < numSM * 4 ? rows : numSM * 4); + dim3 block(cols < maxThreads ? cols : maxThreads); + + renormRowsL1 + <<>>(THCTensor_(data)(state, t), + rows, cols); +} + +THC_API void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); + THCGenerator* gen = THCRandom_getGenerator(state); + + int inputSize = THCTensor_(_nDimension)(state, prob_dist); + THArgCheck(inputSize > 0 && inputSize <= 2, 2, + "prob_dist must be 1 or 2 dim"); + + // Categories are in the innermost dimension + int64_t numDist = + inputSize == 1 ? 1 : THCTensor_(size)(state, prob_dist, 0); + int64_t numCategoriesLong = + inputSize == 1 ? THCTensor_(size)(state, prob_dist, 0) : + THCTensor_(size)(state, prob_dist, 1); + + // Since the index tensor is float, numCategories cannot exceed max + // float integer precision + THArgCheck(numCategoriesLong <= FLOAT32_MAX_CONSECUTIVE_INT, 2, + "number of categories cannot exceed 2^24"); + int numCategories = (int) numCategoriesLong; + + THArgCheck(n_sample > 0, 3, "cannot sample <= 0 samples"); + + if (!with_replacement) { + THArgCheck(n_sample <= numCategories, 2, + "cannot sample n_sample > prob_dist:size(1) samples without " + "replacement"); + } + + int free_prob_dist = 0; + + // Restructure data for 2d + if (inputSize == 1) { + THCTensor *temp = THCTensor_(new)(state); + THCTensor_(unsqueeze1d)(state, temp, prob_dist, 0); + prob_dist = temp; + free_prob_dist = 1; + } + + THCudaLongTensor_resize2d(state, self, numDist, n_sample); + + // get current device properties + cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state); + THAssert(props != NULL); + int numSM = props->multiProcessorCount; + int maxThreads = props->maxThreadsPerBlock; + int maxShared = props->sharedMemPerBlock; + int requiredShared = (numCategories < maxThreads ? numCategories : maxThreads) + * (sizeof(real) * sizeof(accreal)); + + if (n_sample == 1 && maxShared >= requiredShared) { + // Optimized allocation-free implementation + // To exploit greater parallelism for the sampling, generate the + // Uniform random samples in a separate kernel launch, into + // temporarily allocated memory. The device RNG is thread-limited + THCTensor *sampled = THCTensor_(newWithSize2d)(state, numDist, n_sample); + THCTensor_(uniform)(state, sampled, 0.0, 1.0); + + dim3 block(numCategories < maxThreads ? 
numCategories : maxThreads); + dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4); + + sampleMultinomialOnce + <<>>( + THCudaLongTensor_data(state, self), + numDist, + numCategories, + THCTensor_(data)(state, sampled), + THCTensor_(data)(state, prob_dist), + THCTensor_(stride)(state, prob_dist, 0), + THCTensor_(stride)(state, prob_dist, 1) + ); + THCTensor_(free)(state, sampled); + } else { + // Generic, slow implementation with memory allocations + + // For sampling without replacement, we modify the distribution + // for subsequent samples in this space + THCTensor* origDist = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, origDist, prob_dist); + THCTensor_(copy)(state, origDist, prob_dist); + + THCTensor* normDist = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, normDist, prob_dist); + + THCTensor* prefixSum = THCTensor_(new)(state); + + // Renorm along rows + THCTensor_(copy)(state, normDist, origDist); + THCTensor_(renormRows)(state, normDist); + + // Prefix sum along rows + THCTensor_(cumsum)(state, prefixSum, normDist, 1); + + if (with_replacement) { + // Sample with replacement + + // Binary search is warp divergent (so effectively we're running + // with just a single thread), but for better utilization, + // we need each block to have at least 4 warps. + dim3 block(32, 4); + + // Each warp in a block will generate a sample from one + // distribution concurrently. + dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS); + + sampleMultinomialWithReplacement + <<>>( + gen->state.gen_states, + n_sample, + THCudaLongTensor_data(state, self), + numDist, numCategories, + THCTensor_(data)(state, prefixSum)); + } else { + // Sample without replacement + + // Binary search is warp divergent (so effectively we're running + // with just a single thread), but for better utilization, + // we need each block to have at least 4 warps. + dim3 block(32, 4); + + // Each warp in a block will generate a sample from a different + // distribution concurrently. + ptrdiff_t numBlocks = THCCeilDiv(numDist, (int64_t) 4); + dim3 grid(numBlocks < MAX_NUM_BLOCKS ? 
numBlocks : MAX_NUM_BLOCKS); + + for (int sample = 0; sample < n_sample; ++sample) { + if (sample > 0) { + // Update probabilities + // Renorm along rows + THCTensor_(copy)(state, normDist, origDist); + THCTensor_(renormRows)(state, normDist); + + // Prefix sum along rows + THCTensor_(cumsum)(state, prefixSum, normDist, 1); + } + + // The kernel can only draw one sample before we have to + // recalculate our distribution + sampleMultinomialWithoutReplacement + <<>>( + gen->state.gen_states, + n_sample, + sample, + THCudaLongTensor_data(state, self), + numDist, numCategories, + THCTensor_(data)(state, origDist), + THCTensor_(data)(state, prefixSum)); + } + } + + THCTensor_(free)(state, prefixSum); + THCTensor_(free)(state, normDist); + THCTensor_(free)(state, origDist); + } + + // Revert data restructuring based on input sizes + if (inputSize == 1) { + THCudaLongTensor_resize1d(state, self, n_sample); + } + if (free_prob_dist) { + THCTensor_(free)(state, prob_dist); + } +} + +THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ + THAssert(THCTensor_(isContiguous)(state, _q)); + THAssert(THCudaLongTensor_isContiguous(state, _J)); + THAssert(THCTensor_(isContiguous)(state, _probs)); + int64_t inputsize = THCTensor_(nElement)(state, _probs); + THCudaLongTensor *smaller = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *larger = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *smaller_short = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *larger_short = THCudaLongTensor_newWithSize1d(state, inputsize); + + THCudaLongTensor_resize1d(state, _J, inputsize); + THCTensor_(resize1d)(state, _q, inputsize); + + real one = ScalarConvert::to(1); + int inputBlockDim = THCCeilDiv((int)inputsize + BLOCK_SIZE - 1, BLOCK_SIZE); + aliasMultinomialFilter + <<>>( + THCTensor_(data)(state, _q), + THCTensor_(data)(state, _probs), + THCudaLongTensor_data(state, smaller), + THCudaLongTensor_data(state, larger), + THCudaLongTensor_data(state, _J), + THCudaLongTensor_data(state, smaller_short), + THCudaLongTensor_data(state, larger_short), + one, inputsize + ); + + THCudaLongTensor_nonzero(state, smaller_short, smaller); + THCudaLongTensor_nonzero(state, larger_short, larger); + int h_large_c = THCudaLongTensor_nElement(state, larger_short); + THCudaLongTensor_resize1d(state, smaller_short, inputsize); + THCudaLongTensor_resize1d(state, larger_short, inputsize); + aliasMultinomialSetup + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + THCudaLongTensor_data(state, _J), + THCTensor_(data)(state, _q), + inputsize, + THCudaLongTensor_data(state, smaller_short), + THCudaLongTensor_data(state, larger_short), + inputsize - h_large_c, h_large_c + ); + real q_max = THCTensor_(maxall)(state, _q); + condDiv<<< + inputBlockDim, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, _q), + THCudaLongTensor_data(state, _J), + inputsize, q_max + ); + + THCudaLongTensor_free(state, smaller); + THCudaLongTensor_free(state, larger); + THCudaLongTensor_free(state, smaller_short); + THCudaLongTensor_free(state, larger_short); +} + +THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ + THAssert(THCTensor_(isContiguous)(state, _q)); + THAssert(THCudaLongTensor_isContiguous(state, _J)); + THCGenerator* gen = THCRandom_getGenerator(state); + int64_t K = THCudaLongTensor_nElement(state, _J); + int64_t output_nelem 
= THCudaLongTensor_nElement(state, self); + ptrdiff_t size = THCudaLongTensor_nElement(state, self); + + THCTensor *uniform = THCTensor_(newWithSize1d)(state, output_nelem); + THCTensor *bernoulli = THCTensor_(newWithSize1d)(state, output_nelem); + + THCTensor_(uniform)(state, uniform, 0, K); + THCTensor_(uniform)(state, bernoulli, 0, 1); + + multinomialAliasDrawKernel + <<>>( + size, + THCudaLongTensor_data(state, self), + THCudaLongTensor_data(state, _J), + THCTensor_(data)(state, _q), + K, + THCTensor_(data)(state, uniform), + THCTensor_(data)(state, bernoulli) + ); +} + +#endif + +#if defined(THC_REAL_IS_DOUBLE) +GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p) +#else +GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) +#endif + +THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_bernoulli<<>>( + gen->state.gen_states, size, data, p); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p) +{ +#if defined(THC_REAL_IS_FLOAT) + THCTensor_(bernoulli_FloatTensor)(state, self, p); +#elif defined(THC_REAL_IS_DOUBLE) + THCTensor_(bernoulli_DoubleTensor)(state, self, p); +#endif +} + +#define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ +THC_API void THCTensor_(NAME)(THCState* state, \ + THCTensor *self_, PROB_TYPE *probs_) \ +{ \ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ + ptrdiff_t size = THCTensor_(nElement)(state, self_); \ + if (size == 0) return; \ + THCGenerator* gen = THCRandom_getGenerator(state); \ + THCTensor *self = THCTensor_(newContiguous)(state, self_); \ + PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_); \ + ptrdiff_t prob_size = PROB_TYPE##_nElement(state, probs); \ + real *result_data = THCTensor_(data)(state, self); \ + PROB_DATA_TYPE *probs_data = PROB_TYPE##_data(state, probs); \ + \ + THArgCheck(size == prob_size, 3, "inconsistent tensor size"); \ + \ + generate_bernoulli_tensor<<>>( \ + gen->state.gen_states, size, result_data, probs_data); \ + \ + PROB_TYPE##_free(state, probs); \ + THCTensor_(freeCopyTo)(state, self, self_); \ +} + +DEFINE_BERNOULLI_TENSOR(bernoulli_FloatTensor, THCudaTensor, float) +DEFINE_BERNOULLI_TENSOR(bernoulli_DoubleTensor, THCudaDoubleTensor, double) + +#if defined(THC_REAL_IS_DOUBLE) +GENERATE_KERNEL1(generate_geometric, double, double p, double, curand_uniform_double, ceil(log(x) / log(1-p))) +#else +GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (ScalarConvert::to(ceilf(logf(x) / log(1-p))))) +#endif + +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) +#define CURAND64(STATE) (((uint64_t)curand(STATE)) << 32) | (uint64_t)curand(STATE) +GENERATE_KERNEL2(generate_random, real, int32_t base, uint32_t range, uint32_t, curand, \ + static_cast(static_cast((x % range) + base))) +GENERATE_KERNEL2(generate_random_64, real, int64_t base, uint64_t range, uint64_t, CURAND64, \ + static_cast(static_cast((x % range) + base))) +#elif defined(THC_REAL_IS_HALF) +GENERATE_KERNEL2(generate_random, real, int32_t base, 
uint32_t range, uint32_t, curand, + (ScalarConvert::to(static_cast(x % range + base)))) +#else +GENERATE_KERNEL2(generate_random, real, int32_t base, uint32_t range, uint32_t, curand, + static_cast(static_cast(x % range + base))) +#endif + +THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_geometric<<>>( + gen->state.gen_states, size, data, p); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +{ + THArgCheck(min_val < max_val, 2, + "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + uint64_t range = max_val - min_val; + +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) + if (range > 1ULL << 32) { + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, min_val, range); + } else { +#endif + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(min_val), static_cast(range)); +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) + } +#endif + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +{ + THCTensor_(clampedRandom)(state, self_, 0LL, max_val); +}; + +#define HLF_MANT_DIG 11 + +THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + +#if defined(THC_REAL_IS_HALF) + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast((1UL << HLF_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_FLOAT) + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast((1UL << FLT_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_DOUBLE) + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0ULL), static_cast((1ULL << DBL_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_LONG) + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0ULL), static_cast(std::numeric_limits::max()) + 1); +#else + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast(std::numeric_limits::max()) + 1); +#endif + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +#undef HLF_MANT_DIG +#undef CURAND64 +#undef NUM_BLOCKS + +#endif diff --git a/aten/src/THC/generic/THCTensorRandom.h b/aten/src/THC/generic/THCTensorRandom.h new file mode 100644 index 0000000..1deb2db --- /dev/null +++ b/aten/src/THC/generic/THCTensorRandom.h @@ -0,0 +1,30 @@ +#ifndef 
THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorRandom.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b); +THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv); +THC_API void THCTensor_(normal_means)(struct THCState *state, THCTensor *self, THCTensor *means, double stddev); +THC_API void THCTensor_(normal_stddevs)(struct THCState *state, THCTensor *self, double mean, THCTensor *stddevs); +THC_API void THCTensor_(normal_means_stddevs)(struct THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs); +THC_API void THCTensor_(logNormal)(struct THCState *state, THCTensor *self, double mean, double stdv); +THC_API void THCTensor_(exponential)(struct THCState *state, THCTensor *self, double lambda); +THC_API void THCTensor_(cauchy)(struct THCState *state, THCTensor *self, double median, double sigma); +THC_API void THCTensor_(multinomial)(struct THCState *state, THCudaLongTensor *self, THCTensor *prob_dist, int n_sample, int with_replacement); +THC_API void THCTensor_(multinomialAliasSetup)(struct THCState *state, THCTensor *probs, THCudaLongTensor *J, THCTensor *q); +THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q); + +#endif + +THC_API void THCTensor_(random)(struct THCState *state, THCTensor *self); +THC_API void THCTensor_(clampedRandom)(struct THCState *state, THCTensor *self, int64_t min, int64_t max); +THC_API void THCTensor_(cappedRandom)(struct THCState *state, THCTensor *self, int64_t max); +THC_API void THCTensor_(bernoulli)(struct THCState *state, THCTensor *self, double p); +THC_API void THCTensor_(bernoulli_FloatTensor)(struct THCState *state, THCTensor *self, THCudaTensor *p); +THC_API void THCTensor_(bernoulli_DoubleTensor)(struct THCState *state, THCTensor *self, THCudaDoubleTensor *p); +THC_API void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p); +THC_API void THCTensor_(geometric)(struct THCState *state, THCTensor *self, double p); + +#endif diff --git a/aten/src/THC/generic/THCTensorScatterGather.cu b/aten/src/THC/generic/THCTensorScatterGather.cu new file mode 100644 index 0000000..f04ae5a --- /dev/null +++ b/aten/src/THC/generic/THCTensorScatterGather.cu @@ -0,0 +1,362 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorScatterGather.cu" +#else + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_gatherKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(gather)(THCState* state, THCTensor *tensor, + THCTensor *src, int dim, THCudaLongTensor *index) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 4, + "Index tensor must have same dimensions as input tensor"); + THLongStorage *indexSize = THCudaLongTensor_newSizeOf(state, index); + THArgCheck(THCTensor_(isSize)(state, tensor, indexSize), 4, + "Index tensor must have the same size as output tensor."); + THLongStorage_free(indexSize); + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 3, + "Index dimension is out of bounds"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 2, + "Input tensor must have 
same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 2, + "Input tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + THCudaCheck(cudaGetLastError()); + break; + case 2: + RUN(unsigned int, 2, real); + THCudaCheck(cudaGetLastError()); + break; + case 3: + RUN(unsigned int, 3, real); + THCudaCheck(cudaGetLastError()); + break; + default: + RUN(unsigned int, -1, real); + THCudaCheck(cudaGetLastError()); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + RUN(uint64_t, -1, real); + THCudaCheck(cudaGetLastError()); + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 3, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 4, + "Input tensor must have same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + int64_t indexSizeD = THCudaLongTensor_size(state, index, d); + if (d != dim) { + THArgCheck(indexSizeD <= THCTensor_(size)(state, tensor, d), 3, + "Index tensor must not have larger size than output tensor apart from the specified dimension %d, but got index %s output %s", + dim, THCudaLongTensor_sizeDesc(state, index).str, THCTensor_(sizeDesc)(state, tensor).str); + } + THArgCheck(indexSizeD <= THCTensor_(size)(state, src, d), 3, + "Index tensor must not have larger size than input tensor, but got index %s input %s", + THCudaLongTensor_sizeDesc(state, index).str, THCTensor_(sizeDesc)(state, src).str); + } + + 
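  /* [Editorial sketch] For reference, the semantics this scatter path implements,
     written as plain host code for the 2-D, dim == 1 case:
     tensor[i][index[i][j]] = src[i][j]. The helper name and the flat row-major
     layout are illustrative only, not part of the THC API.

       #include <cassert>
       #include <cstdint>
       #include <vector>

       void scatter_dim1_reference(std::vector<float>& out, int64_t out_cols,
                                   const std::vector<int64_t>& index,
                                   const std::vector<float>& src,
                                   int64_t rows, int64_t cols) {
         for (int64_t i = 0; i < rows; ++i) {
           for (int64_t j = 0; j < cols; ++j) {
             int64_t col = index[i * cols + j];
             assert(col >= 0 && col < out_cols);         // index must be in range
             out[i * out_cols + col] = src[i * cols + j]; // overwrite, last writer wins
           }
         }
       }

     Duplicate entries in `index` make plain scatter order-dependent; scatterAdd
     below accumulates instead, which is why the CUDA version of that variant
     typically relies on atomic adds. */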
THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real) + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterAddKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 3, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 4, + "Input tensor must have same dimensions as output tensor"); + THLongStorage *indexDims = THCudaLongTensor_newSizeOf(state, index); + THArgCheck(THCTensor_(isSize)(state, src, indexDims), 3, + "Index tensor must have the same size as input tensor."); + THLongStorage_free(indexDims); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 4, + "Input tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, 
index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real) + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterFillKernel \ + <<>>( \ + tensorInfo, indexInfo, value, dim, (TYPE)totalElements); + +void +THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, + int dim, THCudaLongTensor *index, real value) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == + THCTensor_(_nDimension)(state, tensor), 3, + "Index tensor must have same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == + THCudaLongTensor_size(state, index, d), 4, + "Index tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. 
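  /* [Editorial sketch] Why these dispatches specialize on indexInfo.dims: when the
     dimensionality is a compile-time constant, the linear-index-to-offset math can
     be unrolled, while -1 selects a runtime-loop catch-all. A simplified,
     hypothetical version of that helper (the real one is THC's IndexToOffset over
     TensorInfo) looks like:

       template <typename IndexType>
       struct SimpleTensorInfo {
         IndexType sizes[MAX_CUTORCH_DIMS];
         IndexType strides[MAX_CUTORCH_DIMS];
         int dims;
       };

       // Dims > 0: the loop bound is a constant the compiler can unroll.
       // Dims == -1: fall back to the runtime bound stored in info.dims.
       template <typename IndexType, int Dims>
       __host__ __device__ IndexType indexToOffset(
           IndexType linear, const SimpleTensorInfo<IndexType>& info) {
         IndexType offset = 0;
         const int bound = (Dims == -1) ? info.dims : Dims;
         for (int d = bound - 1; d >= 0; --d) {
           offset += (linear % info.sizes[d]) * info.strides[d];
           linear /= info.sizes[d];
         }
         return offset;
       }
  */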
+ switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real); + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#endif diff --git a/aten/src/THC/generic/THCTensorScatterGather.h b/aten/src/THC/generic/THCTensorScatterGather.h new file mode 100644 index 0000000..e7e83b2 --- /dev/null +++ b/aten/src/THC/generic/THCTensorScatterGather.h @@ -0,0 +1,10 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorScatterGather.h" +#else + +THC_API void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index); +THC_API void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src); +THC_API void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src); +THC_API void THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, real value); + +#endif diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu new file mode 100644 index 0000000..a97d19b --- /dev/null +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -0,0 +1,336 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorSort.cu" +#else + +// In alignment with default sort on a c++ map, this function +// will permute key and value tensors identically, and +// in such a way that the 'key' tensor is ordered numerically +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { + THLongStorage *valueSize = THCudaLongTensor_newSizeOf(state, value); + THArgCheck(THCTensor_(isSize)(state, key, valueSize), 2, + "Key tensor must have same size as value tensor"); + THLongStorage_free(valueSize); + int dims = THCudaLongTensor__nDimension(state, value); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, key); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + ptrdiff_t inElements = THCTensor_(nElement)(state, key); + int64_t keySliceSize = THCTensor_(size)(state, key, dim); + ptrdiff_t keySlices = inElements / keySliceSize; + + if (THCTensor_(_nDimension)(state, key) == 0) { + // Zero-dim tensor; do nothing + return; + } + + // The amount of shared memory and block size is based on + // 2^ceil(lg(n)); we choose that sorting implementation for a given + // size. 
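  /* [Editorial sketch] nextHighestPowerOf2() rounds the slice size up to a power
     of two so that one of the fixed-size bitonic kernels below can be chosen.
     Assuming the usual bit-twiddling formulation for 64-bit values, a standalone
     equivalent would be:

       #include <cstdint>

       uint64_t nextPow2(uint64_t n) {
         if (n == 0) return 1;
         --n;
         n |= n >> 1;  n |= n >> 2;  n |= n >> 4;
         n |= n >> 8;  n |= n >> 16; n |= n >> 32;
         return n + 1;
       }

     For example, nextPow2(1000) == 1024, which routes a 1000-element slice to the
     1024-wide HANDLE_CASE instantiation. */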
+ int64_t ceilPowerOf2 = nextHighestPowerOf2(keySliceSize); + + // FIXME: We'd have to find some other trick with Thrust to perform a + // vectorized (key, value) sort by slice segment + if (ceilPowerOf2 > 2048) { + THError("sortKeyValueInplace only works for sizes <= 2048 at present"); + } + + // The grid is based on the number of independent slices that we + // have to sort; one block per slice + dim3 grid; + if (!THC_getGridFromTiles(keySlices, grid)) { + THError("Slice to sort is too large"); + } + +#define HANDLE_CASE(TYPE, A, SIZE) \ + do { \ + int blockSize = SIZE / 2; \ + if (blockSize < 1) { \ + blockSize = 1; \ + } \ + \ + dim3 block(blockSize); \ + \ + if (dir) { \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ + keyInfo, \ + keySlices, \ + (TYPE) keySliceSize, \ + (TYPE) keyInfo.strides[collapseKeyDim], \ + valueInfo, \ + (TYPE) valueInfo.strides[collapseValueDim], \ + GTComp()); \ + } else { \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ + keyInfo, \ + keySlices, \ + (TYPE) keySliceSize, \ + (TYPE) keyInfo.strides[collapseKeyDim], \ + valueInfo, \ + (TYPE) valueInfo.strides[collapseValueDim], \ + LTComp()); \ + } \ + } while (0) + +#define HANDLE_SORT_CASE(TYPE, A) \ + { \ + switch (ceilPowerOf2) { \ + case 2048: \ + HANDLE_CASE(TYPE, A, 2048); \ + break; \ + case 1024: \ + case 512: \ + case 256: \ + HANDLE_CASE(TYPE, A, 1024); \ + break; \ + case 128: \ + case 64: \ + HANDLE_CASE(TYPE, A, 128); \ + break; \ + case 32: \ + case 16: \ + case 8: \ + case 4: \ + case 2: \ + HANDLE_CASE(TYPE, A, 32); \ + break; \ + case 1: \ + /* Nothing to do, data already sorted */ \ + break; \ + default: \ + assert(false); \ + } \ + } + + // The constructed key/value tensor info is used to select the slice + // we are sorting on a per-block basis + if (THCTensor_canUse32BitIndexMath(state, key)) { + TensorInfo keyInfo = + getTensorInfo(state, key); + keyInfo.reduceDim(dim); + int collapseKeyDim = keyInfo.collapseDims(dim); + + TensorInfo valueInfo = + getTensorInfo(state, value); + valueInfo.reduceDim(dim); + int collapseValueDim = valueInfo.collapseDims(dim); + + if (keyInfo.isContiguous()) { + HANDLE_SORT_CASE(unsigned int, -2); + } else { + switch (keyInfo.dims) { + case 2: + HANDLE_SORT_CASE(unsigned int, 2); + break; + default: + HANDLE_SORT_CASE(unsigned int, -1); + break; + } + } + } else { + TensorInfo keyInfo = + getTensorInfo(state, key); + keyInfo.reduceDim(dim); + int collapseKeyDim = keyInfo.collapseDims(dim); + + TensorInfo valueInfo = + getTensorInfo(state, value); + valueInfo.reduceDim(dim); + int collapseValueDim = valueInfo.collapseDims(dim); + + // int64_t case is rare, just instantiate the generic version + HANDLE_SORT_CASE(uint64_t, -1); + } +#undef HANDLE_CASE +#undef HANDLE_SORT_CASE +#undef HANDLE_A_CASE + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(sortViaThrust)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, bool dir) { + int nDims = THCTensor_(_nDimension)(state, input); + + ptrdiff_t totalElements = THCTensor_(nElement)(state, input); + int64_t sliceSize = THCTensor_(size)(state, input, dim); + int64_t sliceStride = THCTensor_(stride)(state, input, dim); + + // We perform a vectorized segmented sort in Thrust. + // Say we are sorting a (2, 3) tensor. 
We have in flattened form: + // values 0.4 1.2 5.3 6.2 1.3 2.3 + // indices 0 1 2 3 4 5 + // where indices is a global index (across all slices) + + // First we sort by values, globally: + // values 6.2 5.3 2.3 1.2 1.3 0.4 + // indices 3 2 5 1 4 0 + + // Then we stable sort by segment, which is index / 3: + // values 5.3 1.2 0.4 6.2 2.3 1.3 + // indices 2 1 0 3 5 4 + + // Then we translate the global index to a per-slice Lua index + // (index % 3) + 1: + // values 5.3 1.2 0.4 6.2 2.3 1.3 + // indices 3 2 1 1 3 2 + + // This method can only work if the slice we are sorting (`dim`) is + // innermost, and both values and indices are contiguous. We do this + // by re-arranging the input into this form as needed, which will + // unfortunately allocate memory if the request is not in this form. + // Vectorized sort is slower than iterated sort if the number of + // slices is small (since we're sorting twice, instead of invoking a + // smaller sort `numSlices` times), but the Thrust sort + // implementation here is a catch-all, so we're not looking for + // efficiency, but instead correctness. + THCTensor_(copy)(state, sorted, input); + THCTensor* trKeys = THCTensor_(newWithTensor)(state, sorted); + THCudaLongTensor* trIndices = THCudaLongTensor_newWithTensor(state, indices); + + // Transpose dim to innermost + if (dim != nDims - 1) { + THCTensor_(transpose)(state, trKeys, NULL, dim, nDims - 1); + THCudaLongTensor_transpose(state, trIndices, NULL, dim, nDims - 1); + } + + // Thrust must operate on a contiguous layout + THCTensor* trContigKey = THCTensor_(newContiguous)(state, trKeys); + THCudaLongTensor* trContigIndices = THCudaLongTensor_newContiguous(state, trIndices); + + THCTensor_(free)(state, trKeys); + THCudaLongTensor_free(state, trIndices); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr keyIter(THCTensor_(data)(state, trContigKey)); + + // Since we are composing a global index across all segments rather + // than a per-segment index, we treat the memory as int so we don't + // have problems sorting slices < 2^24 but where the entire tensor + // has more than 2^24 elements + thrust::device_ptr + indexIter((int64_t*) THCudaLongTensor_data(state, trContigIndices)); + + // Fill the indices with a global index across all slices + thrust::counting_iterator countIter(0); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + totalElements, indexIter); + + // First, we sort globally (across all slices) according to key + // (the values we're sorting) + if (dir) { + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + keyIter, keyIter + totalElements, indexIter, ThrustGTOp()); + } else { + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + keyIter, keyIter + totalElements, indexIter, ThrustLTOp()); + } + + // Then, re-sort according to slice that each index is + // in. 
This completes the segment sort in Thrust, since we're + // stably sorting here, preserving the relative order of values + // per each slice + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + indexIter, indexIter + totalElements, keyIter, + SliceComp(sliceSize)); + + // Translate the global integer 0-based index to a per-slice real + // Lua index + thrust::for_each( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + indexIter, indexIter + totalElements, + GlobalIndexToPerSliceIndex(sliceSize)); + + // Reverse the transposition as needed + if (dim != nDims - 1) { + THCTensor_(transpose)(state, trContigKey, NULL, dim, nDims - 1); + THCudaLongTensor_transpose(state, trContigIndices, NULL, dim, nDims - 1); + } + + // Then copy back to the expected output + THCTensor_(freeCopyTo)(state, trContigKey, sorted); + THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); +} + +THC_API void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + int64_t dims = THCTensor_(_nDimension)(state, sorted); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, input); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + + // Make sure sufficient output space is allocated + THCTensor_(resizeAs)(state, sorted, input); + THLongStorage *inputSize = THCTensor_(newSizeOf)(state, input); + THCudaLongTensor_resize(state, indices, inputSize, NULL); + THLongStorage_free(inputSize); + + // How large are the slices that we are sorting? + int64_t sliceSize = THCTensor_(size)(state, input, dim); + + // Workaround: + // CUDA 8 uses more shared memory than 7.5 for bitonicSortKVInPlace, + // and so for the double word types, + // we get "too many resources requested for launch" in the 2048 case +#if CUDA_VERSION >= 8000 +#if defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_LONG) + int maxSliceSize = 1024; +#else + int maxSliceSize = 2048; +#endif +#else + int maxSliceSize = 2048; +#endif + + if (sliceSize <= maxSliceSize) { + // Fill `indices` (the values) with the + // slice-relative index. + THCudaLongTensor_fillSliceWithIndex(state, indices, dim); + + // We sort k/v pairs in-place; copy unsorted input to output + THCTensor_(copy)(state, sorted, input); + + // Sort using our in-place k/v kernel that supports arbitrary + // layout + THCTensor_(sortKeyValueInplace)(state, sorted, indices, dim, order); + } else { + // Otherwise, fall back upon Thrust, which handles all other cases + // (potentially slowly, with extra copies/memory allocations) + THCTensor_(sortViaThrust)(state, sorted, indices, input, dim, (bool) order); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h new file mode 100644 index 0000000..009d825 --- /dev/null +++ b/aten/src/THC/generic/THCTensorSort.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorSort.h" +#else + +/* Performs an in-place sort of (keys, values). 
Only works for slice sizes + <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, int order); + +/* Performs an out-of-place sort of `input`, returning the per-slice indices + in `indices` and the sorted values in `sorted` */ +THC_API void THCTensor_(sort)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, int order); + +#endif diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu new file mode 100644 index 0000000..c2f3a28 --- /dev/null +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -0,0 +1,165 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorTopK.cu" +#else + +THC_API void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { + THAssert(topK != NULL && indices != NULL && input_ != NULL); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); + THArgCheck(THCTensor_(_nDimension)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + int64_t dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + int numDims = THCTensor_(_nDimension)(state, input_); + THArgCheck(numDims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + THArgCheck(dim >= 0 && dim < numDims, 6, "dim not in range"); + + int64_t sliceSize = THCTensor_(size)(state, input_, dim); + THArgCheck(k > 0 && k <= sliceSize, 5, "k not in range for dimension"); + + THCTensor *input = THCTensor_(newContiguous)(state, input_); + + // Build the output size, which is the dim being selected set to + // size k + THLongStorage* topKSize = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(topKSize, dim, k); + THCTensor_(resize)(state, topK, topKSize, NULL); + THCudaLongTensor_resize(state, indices, topKSize, NULL); + THLongStorage_free(topKSize); + +#define RUN_K(INDEX_T, DIM, DIR) \ + gatherTopK \ + <<>>( \ + inputInfo, \ + sliceSize, \ + k, \ + inputSlices, \ + /* The actual dimension that the k-selection is running in */ \ + /* may have changed from collapseDims() */ \ + inputInfo.strides[collapseInputDim], \ + topKInfo, \ + topKSlices, \ + topKInfo.strides[collapseTopKDim], \ + indicesInfo, \ + indicesInfo.strides[collapseIndicesDim]) + +#define RUN_DIR(INDEX_T, DIM) \ + if (dir) { \ + RUN_K(INDEX_T, DIM, true); \ + } else { \ + RUN_K(INDEX_T, DIM, false); \ + } + +#define RUN_DIM(INDEX_T) \ + if (allDims == 1) { \ + RUN_DIR(INDEX_T, 1); \ + } else if (allDims == 2) { \ + RUN_DIR(INDEX_T, 2); \ + } else if (allDims == 3) { \ + RUN_DIR(INDEX_T, 3); \ + } else { \ + RUN_DIR(INDEX_T, -1); \ + } + +#define RUN_T(INDEX_T) \ + TensorInfo inputInfo = \ + getTensorInfo(state, input); \ + TensorInfo topKInfo = \ + getTensorInfo(state, topK); \ + TensorInfo indicesInfo = \ + getTensorInfo(state, indices); \ + \ + /* We use these structures solely to find the offset to */ \ + /* each slice we are operating on */ \ + inputInfo.sizes[dim] = 1; \ + topKInfo.sizes[dim] = 1; \ + indicesInfo.sizes[dim] = 1; \ + \ + /* Collapse all other dims */ \ + int collapseInputDim = inputInfo.collapseDims(dim); \ + int collapseTopKDim = topKInfo.collapseDims(dim); \ + int collapseIndicesDim = indicesInfo.collapseDims(dim); \ + \ + int64_t inputSlices = 1; \ + for (int i = 0; i < inputInfo.dims; ++i) { \ + inputSlices *= inputInfo.sizes[i]; \ 
+ } \ + int64_t topKSlices = 1; \ + for (int i = 0; i < topKInfo.dims; ++i) { \ + topKSlices *= topKInfo.sizes[i]; \ + } \ + \ + dim3 grid; \ + if (!THC_getGridFromTiles(inputSlices, grid)) { \ + THError("Slice to sort is too large"); \ + } \ + \ + dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) 32), (int64_t) 1024)); \ + \ + /* This is used as a template parameter to calculate indices. */ \ + /* We only specialize it if all collapsed dim sizes are the */ \ + /* same; otherwise, we use -1 which is the specialization */ \ + /* parameter for arbitrary dimensions */ \ + int allDims = inputInfo.dims; \ + if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ + allDims = -1; \ + } \ + \ + RUN_DIM(INDEX_T); + + // Based on required index size, run the algorithm with the + // appropriate index type + if (THCTensor_canUse32BitIndexMath(state, input) && + THCTensor_canUse32BitIndexMath(state, topK) && + THCTensor_canUse32BitIndexMath(state, indices)) { + RUN_T(uint32_t); + } else { + RUN_T(uint64_t); + } +#undef RUN_T +#undef RUN_DIM +#undef RUN_DIR +#undef RUN_K + + // Sort the results if the user wants them sorted, since our + // selection routine does not ensure sorting + if (sorted) { + // FIXME: the k/v inplace sort along slice only works for size <= + // 2048 at the moment + if (sliceSize <= 2048) { + // This avoids any memory allocations and performs all sorting + // work inplace along the slice + THCTensor_(sortKeyValueInplace)(state, topK, indices, dim, dir); + } else { + // Depend upon the backup sort that returns indices, which we + // can use in conjunction with gather to produce the original + // indices. + // This is not the most efficient implementation, especially since + // there are memory allocations performed here. If the user desires + // greater performance, they should torch.gather() the results + // themselves using the reported indices, providing previously + // allocated tensors to receive the results. 
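      /* [Editorial sketch] What the fallback below computes, as plain host code for
         a single slice: sort the k selected values, then apply the same permutation
         to the indices that were already reported (the CUDA code does this with
         THCTensor_(sort) followed by a gather). The helper name is illustrative only.

           #include <algorithm>
           #include <cstdint>
           #include <numeric>
           #include <vector>

           void sortTopKSlice(std::vector<float>& values,
                              std::vector<int64_t>& indices, bool largestFirst) {
             std::vector<size_t> perm(values.size());
             std::iota(perm.begin(), perm.end(), size_t(0));
             std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
               return largestFirst ? values[a] > values[b] : values[a] < values[b];
             });
             std::vector<float> v(values.size());
             std::vector<int64_t> idx(indices.size());
             for (size_t i = 0; i < perm.size(); ++i) {
               v[i] = values[perm[i]];     // sorted top-k values
               idx[i] = indices[perm[i]];  // original indices, gathered in sort order
             }
             values.swap(v);
             indices.swap(idx);
           }
      */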
+ THCTensor* sortedTopK = THCTensor_(new)(state); + THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state); + THCTensor_(sort)(state, sortedTopK, sortedIndices, topK, dim, dir); + + THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state); + + THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices); + THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices); + + THCTensor_(freeCopyTo)(state, sortedTopK, topK); + THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices); + THCudaLongTensor_free(state, sortedIndices); + } + } + + THCudaLongTensor_free(state, input); + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THC/generic/THCTensorTopK.h b/aten/src/THC/generic/THCTensorTopK.h new file mode 100644 index 0000000..95dbceb --- /dev/null +++ b/aten/src/THC/generic/THCTensorTopK.h @@ -0,0 +1,13 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorTopK.h" +#else + +/* Returns the set of all kth smallest (or largest) elements, depending */ +/* on `dir` */ +THC_API void THCTensor_(topk)(THCState* state, + THCTensor* topK, + THCudaLongTensor* indices, + THCTensor* input, + int64_t k, int dim, int dir, int sorted); + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu new file mode 100644 index 0000000..72b7ff3 --- /dev/null +++ b/aten/src/THCUNN/Abs.cu @@ -0,0 +1,25 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct absupdateOutput_functor +{ + __device__ void operator()(T* output, const T* input) const + { + *output = THCNumerics::abs(*input); + } +}; + +template +struct absupdateGradInput_functor +{ + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const + { + *gradInput = *input < 0 ? - *gradOutput : *gradOutput; + } +}; + +#include "generic/Abs.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu new file mode 100644 index 0000000..cb0f475 --- /dev/null +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct abs_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + Dtype z = x-y; + return ScalarConvert::to(z >= 0 ? z : -z); + } +}; + +template +struct abs_updateOutput_no_reduce_functor +{ + __host__ __device__ void operator()(const Dtype* x, const Dtype* y, Dtype *out) + { + Dtype z = *x - *y; + *out = z >= 0 ? z : -z; + } +}; + +template +struct abs_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *gradInput) + { + *gradInput = ScalarConvert::to(*x >= *y ? 1 : -1); + } +}; + +template +struct abs_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + abs_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + return ((x - y) >= 0 ? 
norm : -norm) * gradOutput; + } +}; + +#include "generic/AbsCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu new file mode 100644 index 0000000..3624588 --- /dev/null +++ b/aten/src/THCUNN/BCECriterion.cu @@ -0,0 +1,134 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#include + +template +inline __host__ __device__ T eps(); + +template <> +inline __host__ __device__ float eps() { return 1e-12f; } + +template <> +inline __host__ __device__ double eps() { return 1e-12; } + +template +inline __host__ __device__ T safe_log(T a) { + if (a == 0.) + { + return THCNumerics::log(eps()); + } + return THCNumerics::log(a); +} + +template +struct bce_functor +{ + template + __host__ __device__ + Acctype operator()(Tuple x) + { + Dtype input = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + assert(input >= 0. && input <= 1.); + return - (t * safe_log(ScalarConvert::to(input)) + + (Acctype(1) - t) * safe_log(Acctype(1) - input)); + } +}; + +template +struct bce_updateOutput_no_reduce_functor +{ + __forceinline__ __host__ __device__ + void operator()( + const Dtype *input, + const Dtype *target, + Dtype *output) + { + assert(*input >= 0. && *input <= 1.); + *output = ScalarConvert::to( + -(*target * safe_log(ScalarConvert::to(*input)) + + (Acctype(1) - *target) * safe_log(Acctype(1) - *input))); + } +}; + +template +struct bce_functor_weights +{ + template + __host__ __device__ + Acctype operator()(Tuple x) + { + Dtype input = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + assert(input >= 0. 
&& input <= 1.); + return - w * (t * safe_log(ScalarConvert::to(input)) + + (Acctype(1) - t) * safe_log(Acctype(1) - input)); + } +}; + +template +struct bce_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ + void operator()( + const Dtype *x, + const Dtype *t, + Dtype *gradInput) + { + *gradInput = ScalarConvert::to( + - (*t - *x) / ((Acctype(1) - *x + eps()) * (*x + eps()))); + } +}; + +template +struct bce_updateGradInput_functor +{ + const Dtype norm; + + bce_updateGradInput_functor(Dtype norm_) + : norm(norm_) + {} + + template + __host__ __device__ + Dtype operator()(Tuple x) + { + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm); + } +}; + +template +struct bce_updateGradInput_functor_weights +{ + const Dtype norm; + + bce_updateGradInput_functor_weights(Dtype norm_) + : norm(norm_) + {} + + template + __host__ __device__ + Dtype operator()(Tuple x) + { + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm * w); + } +}; + +#include "generic/BCECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu new file mode 100644 index 0000000..03531b3 --- /dev/null +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -0,0 +1,291 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +const int WARP_SIZE = 32; + +// The maximum number of threads in a block +const int MAX_BLOCK_SIZE = 512; + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { + int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return MAX_BLOCK_SIZE; +} + +// Returns the index of the most significant 1 bit in `val`. 
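// [Editorial sketch] getMSB() below bounds the number of butterfly steps in
// warpSum() further down: summing across a full warp takes log2(WARP_SIZE)
// xor-shuffle rounds. A minimal standalone equivalent, assuming CUDA 9+'s
// __shfl_xor_sync and a full 32-lane warp (illustrative only, not used by
// the rest of this file):
static __device__ __forceinline__ float warpReduceSumSketch(float val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_xor_sync(0xffffffffu, val, offset);
  }
  return val;  // every lane ends up holding the warp-wide sum
}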
+__device__ __forceinline__ int getMSB(int val) { + return 31 - __clz(val); +} + +template +struct Float2 { + Acctype v1, v2; + __device__ Float2() {} + __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert::to(v1)), v2(ScalarConvert::to(v2)) {} + __device__ Float2(Dtype v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} + __device__ Float2(int v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} + __device__ Float2& operator+=(const Float2& a) { + v1 += a.v1; + v2 += a.v2; + return *this; + } +}; + +template +struct SumOp { + __device__ SumOp(const DeviceTensor3 t) : tensor(t) {} + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + return ScalarConvert::to(tensor[batch][plane][n]); + } + const DeviceTensor3 tensor; +}; + +template +struct VarOp { + __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {} + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + Dtype val = tensor[batch][plane][n]; + return (val - mean) * (val - mean); + } + const Acctype mean; + const DeviceTensor3 tensor; +}; + +template +struct GradOp { + __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g) + : mean(m), input(i), gradOutput(g) {} + __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) { + Dtype g = gradOutput[batch][plane][n]; + Dtype c = ScalarConvert::to(input[batch][plane][n] - mean); + return Float2(g, g * c); + } + const Acctype mean; + const DeviceTensor3 input; + const DeviceTensor3 gradOutput; +}; + +// Sum across all threads within a warp +template +static __device__ __forceinline__ T warpSum(T val) { +#if __CUDA_ARCH__ >= 300 + for (int i = 0; i < getMSB(WARP_SIZE); ++i) { + val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE); + } +#else + __shared__ T values[MAX_BLOCK_SIZE]; + values[threadIdx.x] = val; + __threadfence_block(); + const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE; + for (int i = 1; i < WARP_SIZE; i++) { + val += values[base + ((i + threadIdx.x) % WARP_SIZE)]; + } +#endif + return val; +} + +template +static __device__ __forceinline__ Float2 warpSum(Float2 value) { + value.v1 = warpSum(value.v1); + value.v2 = warpSum(value.v2); + return value; +} + +// Sum across (batch, x/y/z) applying Op() pointwise +template +__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) { + T sum = (T)0; + for (int batch = 0; batch < tensor.getSize(0); ++batch) { + for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) { + sum += op(batch, plane, x); + } + } + + // sum over NumThreads within a warp + sum = warpSum(sum); + + // 'transpose', and reduce within warp again + __shared__ T shared[32]; + __syncthreads(); + if (threadIdx.x % WARP_SIZE == 0) { + shared[threadIdx.x / WARP_SIZE] = sum; + } + if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { + // zero out the other entries in shared + shared[threadIdx.x] = (T)0; + } + __syncthreads(); + if (threadIdx.x / WARP_SIZE == 0) { + sum = warpSum(shared[threadIdx.x]); + if (threadIdx.x == 0) { + shared[0] = sum; + } + } + __syncthreads(); + + // Everyone picks it up, should be broadcast into the whole gradInput + return shared[0]; +} + +template +__global__ void BatchNormalizationUpdateOutputInference_kernel( + const DeviceTensor3 input, + DeviceTensor3 output, + const DeviceTensor1 runningMean, + const DeviceTensor1 runningVar, + const DeviceTensor1 weight, + const DeviceTensor1 bias, + Acctype epsilon) { + + int plane = blockIdx.x; + + Acctype invstd = Acctype(1) / 
sqrt(runningVar[plane].ldg() + epsilon); + Acctype mean = ScalarConvert::to(runningMean[plane].ldg()); + Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane].ldg()) : Acctype(1); + Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane].ldg()) : Acctype(0); + + // Write normalized and update the output + for (int batch = 0; batch < input.getSize(0); batch++) { + for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invstd + beta); + } + } +} + +template +__global__ void BatchNormalizationUpdateOutput_kernel( + const DeviceTensor3 input, + DeviceTensor3 output, + const DeviceTensor1 weight, + const DeviceTensor1 bias, + const Acctype epsilon, + const Acctype momentum, + DeviceTensor1 runningMean, + DeviceTensor1 runningVar, + DeviceTensor1 saveMean, + DeviceTensor1 saveStd) { + + int plane = blockIdx.x; + int N = input.getSize(0) * input.getSize(2); + + Acctype norm = Acctype(1) / N; + + // Compute the mean and variance across (batch, x/y/z) + Acctype mean = reduce(SumOp(input), input, plane) * norm; + __syncthreads(); + Acctype varN = reduce(VarOp(mean, input), input, plane); + Acctype invStd = 0; + if (varN != Acctype(0) || epsilon != Acctype(0)) { + invStd = 1 / sqrt(varN * norm + epsilon); + } + + // Save the mean, variance, and moving averages + if (threadIdx.x == 0) { + // Momentum based writeback + Acctype unbiasedVar = varN / (N - 1); + saveMean[plane] = ScalarConvert::to(mean); + saveStd[plane] = ScalarConvert::to(invStd); + if (runningMean.data() != NULL) { + runningMean[plane] = ScalarConvert::to((1 - momentum) * runningMean[plane] + momentum * mean); + } + if (runningVar.data() != NULL) { + runningVar[plane] = ScalarConvert::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar); + } + } + + // Write normalized and update the output + Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : ScalarConvert::to(1); + Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) : ScalarConvert::to(0); + for (int batch = 0; batch < input.getSize(0); ++batch) { + for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invStd + beta); + } + } +} + +template +__global__ void BatchNormalizationBackward_kernel( + const DeviceTensor3 input, + const DeviceTensor3 gradOutput, + DeviceTensor3 gradInput, + DeviceTensor1 gradWeight, + DeviceTensor1 gradBias, + const DeviceTensor1 weight, + const DeviceTensor1 runningMean, + const DeviceTensor1 runningVar, + const DeviceTensor1 saveMean, + const DeviceTensor1 saveStd, + bool train, + Acctype scale, + double eps) { + + int plane = blockIdx.x; + int N = gradOutput.getSize(0) * gradOutput.getSize(2); + + Acctype mean, stdVal; + if (train) { + mean = ScalarConvert::to(saveMean[plane]); + stdVal = ScalarConvert::to(saveStd[plane]); + } else { + mean = ScalarConvert::to(runningMean[plane]); + stdVal = 1 / sqrt(runningVar[plane] + eps); + } + + Acctype weightVal = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : Acctype(1); + Acctype norm = Acctype(1) / N; + + // Compute two values across (batch, x/y/z) in one pass: + // 1. Sum(gradOutput) + // 2. 
DotProduct(input - mean, gradOutput) + GradOp g(mean, input, gradOutput); + Float2 res = reduce, GradOp, DeviceTensor3>(g, gradOutput, plane); + Acctype gradOutputSum = res.v1; + Acctype dotP = res.v2; + + Acctype gradMean = gradOutputSum * norm; + Acctype projScale = dotP * norm * stdVal * stdVal; + Acctype gradScale = stdVal * weightVal; + + if (gradInput.numElements() > 0) { + for (int batch = 0; batch < gradOutput.getSize(0); ++batch) { + for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) { + Dtype gradOut = gradOutput[batch][plane][x]; + if (train) { + Dtype inp = input[batch][plane][x]; + Acctype proj = (inp - mean) * projScale; + gradInput[batch][plane][x] = ScalarConvert::to((gradOut - proj - gradMean) * gradScale); + } else { + gradInput[batch][plane][x] = ScalarConvert::to(gradOut * gradScale); + } + } + } + } + + if (gradWeight.numElements() > 0) { + if (threadIdx.x == 0) { + gradWeight[plane] += ScalarConvert::to(scale * dotP * stdVal); + } + } + + if (gradBias.numElements() > 0) { + if (threadIdx.x == 0) { + gradBias[plane] += ScalarConvert::to(scale * gradOutputSum); + } + } +} + +#include "generic/BatchNormalization.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt new file mode 100644 index 0000000..79b11c2 --- /dev/null +++ b/aten/src/THCUNN/CMakeLists.txt @@ -0,0 +1,88 @@ +SET(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} +${CMAKE_CURRENT_SOURCE_DIR}/AbsCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/Abs.cu +${CMAKE_CURRENT_SOURCE_DIR}/BatchNormalization.cu +${CMAKE_CURRENT_SOURCE_DIR}/BCECriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/ClassNLLCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/Col2Im.cu +${CMAKE_CURRENT_SOURCE_DIR}/DistKLDivCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/ELU.cu +${CMAKE_CURRENT_SOURCE_DIR}/FeatureLPPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/FusedRNNKernel.cu +${CMAKE_CURRENT_SOURCE_DIR}/GatedLinearUnit.cu +${CMAKE_CURRENT_SOURCE_DIR}/HardTanh.cu +${CMAKE_CURRENT_SOURCE_DIR}/Im2Col.cu +${CMAKE_CURRENT_SOURCE_DIR}/IndexLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/L1Cost.cu +${CMAKE_CURRENT_SOURCE_DIR}/LeakyReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/LogSigmoid.cu +${CMAKE_CURRENT_SOURCE_DIR}/LookupTableBag.cu +${CMAKE_CURRENT_SOURCE_DIR}/LookupTable.cu +${CMAKE_CURRENT_SOURCE_DIR}/MarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MSECriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MultiLabelMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MultiMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/PReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/RReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/Sigmoid.cu +${CMAKE_CURRENT_SOURCE_DIR}/SmoothL1Criterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftPlus.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftShrink.cu +${CMAKE_CURRENT_SOURCE_DIR}/SparseLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialClassNLLCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionLocal.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialCrossMapLRN.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu 
+${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialSubSampling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/Sqrt.cu +${CMAKE_CURRENT_SOURCE_DIR}/Square.cu +${CMAKE_CURRENT_SOURCE_DIR}/Tanh.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalReflectionPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalRowConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/Threshold.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingTrilinear.cu +PARENT_SCOPE) + +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + "${CMAKE_CURRENT_SOURCE_DIR}" +PARENT_SCOPE) + +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DESTINATION ${ATEN_INSTALL_INCLUDE_SUBDIR} + FILES_MATCHING PATTERN "*.h" PATTERN "*.cuh") diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu new file mode 100644 index 0000000..1043454 --- /dev/null +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -0,0 +1,185 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include +#include + +static const int NTHREADS = 32; + +template +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int size_average, + int n_classes, + int64_t ignore_index) { + assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0); + + // TODO: T4951791 Reuse code between updateOutput_kernel1 and + // updateOutput_kernel. + + int t = (int) *target - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + Dtype cur_weight = weights ? 
weights[t] : ScalarConvert::to(1); + *output = -cur_weight * input[t]; + *total_weight = cur_weight; + if (size_average && *total_weight > 0) { + *output /= *total_weight; + } + } +} + +template +__global__ void ClassNLLCriterion_updateOutput_no_reduce_kernel( + int batch_size, + THCDeviceTensor input, + THCDeviceTensor target, + THCDeviceTensor output, + Dtype *weights, + int n_classes, + int ignore_index) { + + CUDA_KERNEL_LOOP(index, batch_size) { + int cur_target = target[index] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + output[index] = ScalarConvert::to(0); + continue; + } + assert(cur_target >= 0 && cur_target < n_classes); + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + output[index] = -weight * input[index][cur_target]; + } +} + +template +__global__ void ClassNLLCriterion_updateGradInput_no_reduce_kernel( + int batch_size, + THCDeviceTensor target, + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + Dtype *weights, + int n_classes, + int ignore_index) { + + CUDA_KERNEL_LOOP(index, batch_size) { + int cur_target = target[index] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + assert(cur_target >= 0 && cur_target < n_classes); + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + gradInput[index][cur_target] = -weight * gradOutput[index]; + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int size_average, + int nframe, + int ndim, + int n_classes, + int64_t ignore_index) { + __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS]; + int i, t; + Dtype cur_weight; + + shInputs[threadIdx.x] = ScalarConvert::to(0); + acc_weight[threadIdx.x] = ScalarConvert::to(0); + for (i = threadIdx.x; i < nframe; i += NTHREADS) { + t = target[i] - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + cur_weight = weights ? weights[t] : ScalarConvert::to(1); + shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight; + acc_weight[threadIdx.x] += cur_weight; + } + } + __syncthreads(); + + // TODO: T4951791 Reuse code between updateOutput_kernel1 and + // updateOutput_kernel + + if (threadIdx.x == 0) { + *output = *total_weight = ScalarConvert::to(0); + Acctype outputAcc = 0; + Acctype total_weightAcc = 0; + for (i = 0; i < NTHREADS; ++i){ + // FIXME should we do somethigng here + outputAcc += shInputs[i]; + total_weightAcc += acc_weight[i]; + } + *total_weight = ScalarConvert::to(total_weightAcc); + *output = ScalarConvert::to(outputAcc); + if (size_average && *total_weight > 0) { + *output = ScalarConvert::to(outputAcc / total_weightAcc); + } + + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1( + Dtype* gradInput, + Dtype* gradOutput, + Dtype* weights, + THCIndex_t* target, + Dtype* total_weight, + int size_average, + int n_classes, + int64_t ignore_index) +{ + if (*total_weight <= 0) { + return; + } + Dtype norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + int t = (int)*target - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[t] = -(weights ? 
weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel( + Dtype *gradInput, + Dtype *gradOutput, + THCIndex_t *target, + Dtype *weights, + Dtype *total_weight, + int size_average, + int nframe, + int ndim, + int n_classes, + int64_t ignore_index) +{ + if (*total_weight <= 0) { + return; + } + int i, t; + Dtype norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + + for (i = threadIdx.x; i < nframe; i += NTHREADS) { + t = (int)target[i] - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } + } +} + +#include "generic/ClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu new file mode 100644 index 0000000..d7fd995 --- /dev/null +++ b/aten/src/THCUNN/Col2Im.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/Col2Im.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu new file mode 100644 index 0000000..e4e85b7 --- /dev/null +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -0,0 +1,64 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct kl_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + Acctype yAcc = ScalarConvert::to(y); + return y > 0 ? yAcc * (THCNumerics::log(yAcc) - x) : Acctype(0); + } +}; + +template +struct kl_updateOutput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *output) + { + *output = *y > 0 ? *y * (THCNumerics::log(*y) - *x) : ScalarConvert::to(0); + } +}; + +template +struct kl_updateGradInput_no_reduce_functor +{ + __host__ __device__ void operator()( + const Dtype *target, + const Dtype *gradOutput, + Dtype *gradInput) + { + *gradInput = *target > 0 ? (-*target) * *gradOutput : ScalarConvert::to(0); + } +}; + +template +struct kl_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + kl_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + return y > 0 ? norm * (-y) * gradOutput : ScalarConvert::to(0); + } +}; + +#include "generic/DistKLDivCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu new file mode 100644 index 0000000..d17d185 --- /dev/null +++ b/aten/src/THCUNN/ELU.cu @@ -0,0 +1,59 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct ELUupdateOutput_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateOutput_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *output, const T *input) const + { + *output = *input <= 0 ? 
(exp(*input) - 1) * negcoef_ : *input * poscoef_; + } +}; + +// in-place variant +template +struct ELUupdateOutputIP_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateOutputIP_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *x) const + { + *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; + } +}; + +template +struct ELUupdateGradInput_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateGradInput_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); + } +}; + +#include "generic/ELU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/FeatureLPPooling.cu b/aten/src/THCUNN/FeatureLPPooling.cu new file mode 100644 index 0000000..7026f0d --- /dev/null +++ b/aten/src/THCUNN/FeatureLPPooling.cu @@ -0,0 +1,653 @@ +#include "THCUNN.h" +#include "THCAtomics.cuh" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCNumerics.cuh" +#include "THCTensorTypeUtils.cuh" + +#define OUTPUT_FEATURES_PER_THREAD 32 +#define MAX_WARPS_PER_RUN 4 + +namespace detail { + +/// Various utilities for dealing with arrays of values which are +/// maintained in thread-local registers. All accesses are done in such +/// a way such that the index is statically known, which preserves the +/// compiler's ability to allocate the values to registers, as opposed +/// to local memory. +template +struct RegisterUtils { + /// Register shifting: move elements towards the beginning of the + /// array (towards 0) by `Shift` places: + /// arr[i] = arr[i + Shift] + /// The `Shift` elements at the end are left unchanged. 
+ template + __device__ __forceinline__ static void shiftLeft(T arr[N]) { + // e.g., N = 5, Shift = 2: + // 0 1 2 3 4 becomes => + // 2 3 4 3 4 (last are unchanged) +#pragma unroll + for (int i = 0; i < N - Shift; ++i) { + arr[i] = arr[i + Shift]; + } + } +}; + +template +__device__ __forceinline__ +int getDim1Point(const THCDeviceTensor& input) { + int threadPoint = blockIdx.x * blockDim.x + threadIdx.x; + return threadPoint / input.getSize(3); +} + +template +__device__ __forceinline__ +int getDim2Point(const THCDeviceTensor& input) { + int threadPoint = blockIdx.x * blockDim.x + threadIdx.x; + return threadPoint % input.getSize(3); +} + +__device__ __forceinline__ +int getStartOutputFeature() { + return blockIdx.y * OUTPUT_FEATURES_PER_THREAD; +} + +template +__device__ __forceinline__ +int getEndOutputFeature(const THCDeviceTensor& output) { + return min((blockIdx.y + 1) * OUTPUT_FEATURES_PER_THREAD, output.getSize(1)); +} + +__device__ __forceinline__ +int getBatch() { + return blockIdx.z; +} + +// All of these functions that follow are MathOps; they are template +// parameters so L2 can be more efficiently implemented +// template +// typedef T (*MathOp)(const T in, const T arg); + +template +__device__ __forceinline__ T power2(const T in, const T power) { + return THCNumerics::mul(in, in); +} + +template +__device__ __forceinline__ T root2(const T in, const T power) { + return THCNumerics::sqrt(in); +} + +template +__device__ __forceinline__ T powerGrad2(const T in, const T power) { + return in; +} + +template +__device__ __forceinline__ T powerN(const T in, const T power) { + return THCNumerics::pow(in, power); +} + +template +__device__ __forceinline__ T rootN(const T in, const T power) { + const T invPower = THCNumerics::cinv(power); + return THCNumerics::pow(in, invPower); +} + +template +__device__ __forceinline__ T powerGradN(const T in, const T power) { + return THCNumerics::pow(in, + THCNumerics::sub(power, + ScalarConvert::to(1))); +} + +// Input is of the form: +// [batch][feature dim][optional dim 1][optional dim 2] +template +__global__ void +featureLPPoolingUpdateOutput(const THCDeviceTensor input, + THCDeviceTensor output, + T power) { + // What non-feature points is this thread handling? + int dim1Point = getDim1Point(input); + int dim2Point = getDim2Point(input); + + if (dim1Point >= input.getSize(2) || dim2Point >= input.getSize(3)) { + // This thread in the warp is out of bounds + return; + } + + // What feature points is this thread handling? + int startOutputFeature = getStartOutputFeature(); + int endOutputFeature = getEndOutputFeature(output); + int startInputFeature = startOutputFeature * Stride; + + // What batch points is this thread handling? + int batch = getBatch(); + + // If stride >= width, then there is no loaded data reuse. + // If stride > 1 and stride < width, then shift by stride, since we + // can reuse Width - Stride elements from the previous round. + // e.g., width = 5, stride = 2, + // output 0 uses input 0 1 2 3 4 + // output 1 uses input 2 3 4 5 6 (inputs 2 - 4 are reused, i.e., 5 - + // 2 elements are reused, and we have to shift the array by 2) + // + // e.g., width = 5, stride = 3, + // output 0 uses input 0 1 2 3 4 + // output 1 uses input 3 4 5 6 7 (inputs 3 - 4 are reused, i.e., 5 - 3 + // elements are reused, and we have to shift the array by 3) + + // Valid only pooling: load Width elements from input (Width - + // Stride is handled here, at the top of the loop we handle the + // remaining Stride elements). 
We already verified that the input is + // larger than the width. + // `in` will contain the input values ^ power. + T in[Width]; + +#pragma unroll + for (int i = 0; i < Width - Stride; ++i) { + const T data = + input[batch][startInputFeature + i][dim1Point][dim2Point]; + in[i] = PowerFunc(data, power); + } + + for (int outputFeature = startOutputFeature; + outputFeature < endOutputFeature; + ++outputFeature) { + // If Stride < Width, we're loading Stride new values starting at + // Width - Stride + // If Stride >= Width, we're loading Width new values starting at 0 + if (Stride < Width) { + int nextInputFeature = outputFeature * Stride + Width - Stride; + +#pragma unroll + for (int i = 0; i < Stride; ++i) { + const T data = + input[batch][nextInputFeature + i][dim1Point][dim2Point]; + in[Width - Stride + i] = PowerFunc(data, power); + } + } else { + int nextInputFeature = outputFeature * Stride; + +#pragma unroll + for (int i = 0; i < Width; ++i) { + T data = input[batch][nextInputFeature + i][dim1Point][dim2Point]; + in[i] = PowerFunc(data, power); + } + } + + // Calculate the new output feature + T val = ScalarConvert::to(0); + for (int i = 0; i < Width; ++i) { + val = THCNumerics::add(val, in[i]); + } + + val = RootFunc(val, power); + output[batch][outputFeature][dim1Point][dim2Point] = val; + + if (Stride < Width) { + // Shift registers for calculating the next point + RegisterUtils::template shiftLeft(in); + } + } +} + +// forward pass: f(a, ..., z) = (a^p + ... + z^p)^(1 / p) +// for bprop: +// partial df(a, ... z)/da = a^(p - 1) * (a^p + ... + z^p)^((1 / p) - 1) = +// a^(p - 1) * 1/(f(a, ..., z)^(p - 1)) = (a / f(a, ..., z))^(p - 1) +// +// example: for p = 2, df(a, ..., z)/da = a / f(a, ..., z) +// example: for p = 3, df(a, ..., z)/da = (a / f(a, ..., z))^2 +// +// PowerGradFunc implements x^(p - 1) +template +__global__ void +featureLPPoolingUpdateGradInput(const THCDeviceTensor gradOutput, + const THCDeviceTensor input, + const THCDeviceTensor output, + THCDeviceTensor gradInput, + T power) { + // What non-feature points is this thread handling? + int dim1Point = getDim1Point(input); + int dim2Point = getDim2Point(input); + + if (dim1Point >= input.getSize(2) || dim2Point >= input.getSize(3)) { + // This thread in the warp is out of bounds + return; + } + + // What feature points is this thread handling? [start, end) + int startOutputFeature = getStartOutputFeature(); + int endOutputFeature = getEndOutputFeature(output); + + // What is the first input point that the output features depend + // upon? [start, end) + int startInputFeature = startOutputFeature * Stride; + int endInputFeature = endOutputFeature * Stride; + + // What batch points is this thread handling? + int batch = getBatch(); + + // atomicAdd into gradInput is slow, avoid it where possible. + // We can do this because there is a range of gradInput elements + // that we are updating exclusively. 
This is how we find it + // + // width = 3 stride = 1 example: + // ------------------------------ + // startOutputFeature for this thread + // | + // | + // previous thread's output feature + // | | + // | | gradOutput + // __v____v___________________ + // | | | | | | + // --------------------------- + // |\ \_____ + // | \__ \ gradInput + // __v____v____v_____________ + // | | | | | | + // --------------------------- + // A A + // | | + // startInputFeature + // | + // exclusiveStartInputFeature + // + // exclusiveStartInputFeature is the first input feature that we can + // write into exclusively; the one right before it overlaps with + // updates from a previous thread and thus has to use atomicAdd. + int exclusiveStartInputFeature = + startInputFeature == 0 ? + // no thread is before ourselves + 0 : + // there is a thread before ourselves + startInputFeature + (Width - 1) * Stride; + + // Similarly, exclusiveEndInputFeature is the last input feature + // that we can write into exclusively, since we might be overlapping + // with the following thread + int exclusiveEndInputFeature = + endOutputFeature == output.getSize(1) ? + // no thread is after ourselves + endInputFeature + (Width - 1) * Stride : + // there is a thread after ourselves + endInputFeature; + + // As with updateOutput preload input elements, except no need to + // transform them + T in[Width]; +#pragma unroll + for (int i = 0; i < Width - Stride; ++i) { + in[i] = input[batch][startInputFeature + i][dim1Point][dim2Point]; + } + + for (int outputFeature = startOutputFeature; + outputFeature < endOutputFeature; + ++outputFeature) { + // As with updateOutput load the subsequent input elements that we + // need, except no need to transform them + // + // If Stride < Width, we're loading Stride new values starting at + // Width - Stride + // If Stride >= Width, we're loading Width new values starting at 0 + if (Stride < Width) { + int nextInputFeature = outputFeature * Stride + Width - Stride; + +#pragma unroll + for (int i = 0; i < Stride; ++i) { + in[Width - Stride + i] = + input[batch][nextInputFeature + i][dim1Point][dim2Point]; + } + } else { + int nextInputFeature = outputFeature * Stride; + +#pragma unroll + for (int i = 0; i < Width; ++i) { + in[i] = input[batch][nextInputFeature + i][dim1Point][dim2Point]; + } + } + + // A given output feature gradient contributes to `Width` input + // gradients + const T gradOut = + gradOutput[batch][outputFeature][dim1Point][dim2Point]; + + // Load output (f(x_is)). It is possible that this is zero, in + // which case we'll ignore this point. 
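+      // Illustrative note: the backward factor computed below is
+      //   gradOut * (x_i / f(x_1, ..., x_w))^(p - 1),
+      // which divides by the forward output f. Skipping output points that are
+      // exactly zero therefore avoids a division by zero; such a point simply
+      // contributes nothing to gradInput (which was zeroed by the launcher).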
+ T out = output[batch][outputFeature][dim1Point][dim2Point]; + if (THCNumerics::eq(out, ScalarConvert::to(0))) { + continue; + } + + int curStartInputFeature = outputFeature * Stride; + int curEndInputFeature = outputFeature * Stride + Width - 1; + + if (curStartInputFeature >= exclusiveStartInputFeature && + curEndInputFeature < exclusiveEndInputFeature) { + // This thread is exclusively responsible for updating these + // input points, so we need not make the addition atomic + for (int i = 0; i < Width; ++i) { + int inputFeature = outputFeature * Stride + i; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + const T val = THCNumerics::mul( + gradOut, + PowerGradFunc(THCNumerics::div(in[i], out), power)); + + gradInput[batch][inputFeature][dim1Point][dim2Point] = + THCNumerics::add( + gradInput[batch][inputFeature][dim1Point][dim2Point], val); + } + } else { + // Handle start and end boundary cases: potential overlap with + // other threads + for (int i = 0; i < Width; ++i) { + int inputFeature = outputFeature * Stride + i; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + T val = THCNumerics::mul( + gradOut, + PowerGradFunc(THCNumerics::div(in[i], out), power)); + + // We don't overlap other threads for this range + if (inputFeature >= exclusiveStartInputFeature && + inputFeature < exclusiveEndInputFeature) { + gradInput[batch][inputFeature][dim1Point][dim2Point] + = THCNumerics::add( + gradInput[batch][inputFeature][dim1Point][dim2Point], val); + } else { + // We are potentially overlapping with threads handling + // features before ourselves, so these need to be added atomically + atomicAdd(&gradInput[batch][inputFeature][dim1Point][dim2Point], + val); + } + } + } + + if (Stride < Width) { + // Shift registers for calculating the next point + RegisterUtils::template shiftLeft(in); + } + } +} + +} // namespace detail + +inline int lpPoolingOutputSize(int inputSize, int width, int stride) { + return ((inputSize - width) / stride) + 1; +} + +template +bool +runFeatureLPPoolingUpdateOutput(THCState* state, + const THCDeviceTensor& input, + THCDeviceTensor& output, + float power, int width, int stride) { + cudaStream_t stream = + THCState_getCurrentStream(state); + const cudaDeviceProp* deviceProperties = + THCState_getCurrentDeviceProperties(state); + + int outputFeatures = ((input.getSize(1) - width) / stride) + 1; + + THAssert(input.getSize(0) == output.getSize(0)); + THAssert(outputFeatures == output.getSize(1)); + THAssert(input.getSize(1) >= width); + + THAssert(input.getSize(2) == output.getSize(2)); + THAssert(input.getSize(3) == output.getSize(3)); + THAssert(power > 0.0f); + THAssert(width >= 1); + THAssert(stride >= 1); + + // Split non-features among threads and grid x + int totalNonFeatureSize = input.getSize(2) * input.getSize(3); + int numWarps = + min(THCCeilDiv(totalNonFeatureSize, deviceProperties->warpSize), + MAX_WARPS_PER_RUN); + int blockSize = deviceProperties->warpSize * numWarps; + + // Split non-features among grid x + int nonFeatureSizeBlocks = THCCeilDiv(totalNonFeatureSize, blockSize); + + // Split features among grid y, up to a maximum number of features per thread + int featureBlocks = THCCeilDiv(outputFeatures, OUTPUT_FEATURES_PER_THREAD); + + // Split batch among grid z. 
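+  // Illustrative example (hypothetical sizes): for an input of shape
+  // [8][64][20][20] with width = 3 and stride = 2, outputFeatures =
+  // (64 - 3) / 2 + 1 = 31 and totalNonFeatureSize = 20 * 20 = 400; with
+  // warpSize = 32 this gives blockSize = min(ceil(400 / 32), 4) * 32 = 128,
+  // so grid = (ceil(400 / 128), ceil(31 / 32), 8) = (4, 1, 8), block = (128).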
+ dim3 grid(nonFeatureSizeBlocks, featureBlocks, input.getSize(0)); + dim3 block(blockSize); + +#define L2_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateOutput<<>>( \ + input, output, \ + ScalarConvert::to(power)); \ + return true; + +#define L2_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + L2_STRIDE_CASE(1, WIDTH); \ + L2_STRIDE_CASE(2, WIDTH); \ + L2_STRIDE_CASE(3, WIDTH); \ + L2_STRIDE_CASE(4, WIDTH); \ + } + +#define LP_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateOutput<<>>( \ + input, output, \ + ScalarConvert::to(power)); \ + return true; + +#define LP_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + LP_STRIDE_CASE(1, WIDTH); \ + LP_STRIDE_CASE(2, WIDTH); \ + LP_STRIDE_CASE(3, WIDTH); \ + LP_STRIDE_CASE(4, WIDTH); \ + } + + if (power == 2.0f) { + switch (width) { + L2_WIDTH_CASE(2); + L2_WIDTH_CASE(3); + L2_WIDTH_CASE(4); + L2_WIDTH_CASE(5); + L2_WIDTH_CASE(6); + L2_WIDTH_CASE(7); + L2_WIDTH_CASE(8); + L2_WIDTH_CASE(9); + L2_WIDTH_CASE(10); + L2_WIDTH_CASE(11); + L2_WIDTH_CASE(12); + L2_WIDTH_CASE(13); + L2_WIDTH_CASE(14); + L2_WIDTH_CASE(15); + L2_WIDTH_CASE(16); + } + } else { + switch (width) { + LP_WIDTH_CASE(2); + LP_WIDTH_CASE(3); + LP_WIDTH_CASE(4); + LP_WIDTH_CASE(5); + LP_WIDTH_CASE(6); + LP_WIDTH_CASE(7); + LP_WIDTH_CASE(8); + LP_WIDTH_CASE(9); + LP_WIDTH_CASE(10); + LP_WIDTH_CASE(11); + LP_WIDTH_CASE(12); + LP_WIDTH_CASE(13); + LP_WIDTH_CASE(14); + LP_WIDTH_CASE(15); + LP_WIDTH_CASE(16); + } + } + + // Otherwise, we have an unhandled width and/or stride. + return false; + +#undef L2_STRIDE_CASE +#undef L2_WIDTH_CASE +#undef LP_STRIDE_CASE +#undef LP_WIDTH_CASE +} + +template +bool +runFeatureLPPoolingUpdateGradInput(THCState* state, + const THCDeviceTensor& gradOutput, + const THCDeviceTensor& input, + const THCDeviceTensor& output, + THCDeviceTensor& gradInput, + float power, int width, int stride) { + cudaStream_t stream = + THCState_getCurrentStream(state); + const cudaDeviceProp* deviceProperties = + THCState_getCurrentDeviceProperties(state); + + for (int i = 0; i < 4; ++i) { + THAssert(gradOutput.getSize(i) == output.getSize(i)); + THAssert(gradInput.getSize(i) == input.getSize(i)); + } + + int outputFeatures = ((input.getSize(1) - width) / stride) + 1; + + THAssert(gradInput.getSize(0) == gradOutput.getSize(0)); + THAssert(outputFeatures == gradOutput.getSize(1)); + THAssert(gradInput.getSize(1) >= width); + + THAssert(gradInput.getSize(2) == gradOutput.getSize(2)); + THAssert(gradInput.getSize(3) == gradOutput.getSize(3)); + THAssert(power > 0.0f); + THAssert(width >= 1); + THAssert(stride >= 1); + + // Different threads are potentially adding into overlapping input + // points, so we must clear out gradInput before continuing. + gradInput.zero(stream); + + // Split non-features among threads and grid x + int totalNonFeatureSize = input.getSize(2) * input.getSize(3); + int numWarps = + min(THCCeilDiv(totalNonFeatureSize, deviceProperties->warpSize), + MAX_WARPS_PER_RUN); + int blockSize = deviceProperties->warpSize * numWarps; + + // Split non-features among grid x + int nonFeatureSizeBlocks = THCCeilDiv(totalNonFeatureSize, blockSize); + + // Split features among grid y, up to a maximum number of features per thread + int featureBlocks = THCCeilDiv(outputFeatures, OUTPUT_FEATURES_PER_THREAD); + + // Split batch among grid z. 
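+  // Note on the dispatch below: for power == 2 the backward factor
+  // (x_i / f)^(p - 1) reduces to x_i / f, so the L2 cases use the identity
+  // functor powerGrad2, while the generic LP cases use powerGradN (which
+  // calls pow with exponent p - 1).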
+ dim3 grid(nonFeatureSizeBlocks, featureBlocks, input.getSize(0)); + dim3 block(blockSize); + +#define L2_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateGradInput< \ + T, WIDTH, STRIDE, detail::powerGrad2><<>>( \ + gradOutput, input, output, gradInput, \ + ScalarConvert::to(power)); \ + return true; + +#define L2_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + L2_STRIDE_CASE(1, WIDTH); \ + L2_STRIDE_CASE(2, WIDTH); \ + L2_STRIDE_CASE(3, WIDTH); \ + L2_STRIDE_CASE(4, WIDTH); \ + } + +#define LP_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateGradInput< \ + T, WIDTH, STRIDE, detail::powerGradN><<>>( \ + gradOutput, input, output, gradInput, \ + ScalarConvert::to(power)); \ + return true; + +#define LP_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + LP_STRIDE_CASE(1, WIDTH); \ + LP_STRIDE_CASE(2, WIDTH); \ + LP_STRIDE_CASE(3, WIDTH); \ + LP_STRIDE_CASE(4, WIDTH); \ + } + + if (power == 2.0f) { + switch (width) { + L2_WIDTH_CASE(2); + L2_WIDTH_CASE(3); + L2_WIDTH_CASE(4); + L2_WIDTH_CASE(5); + L2_WIDTH_CASE(6); + L2_WIDTH_CASE(7); + L2_WIDTH_CASE(8); + L2_WIDTH_CASE(9); + L2_WIDTH_CASE(10); + L2_WIDTH_CASE(11); + L2_WIDTH_CASE(12); + L2_WIDTH_CASE(13); + L2_WIDTH_CASE(14); + L2_WIDTH_CASE(15); + L2_WIDTH_CASE(16); + } + } else { + switch (width) { + LP_WIDTH_CASE(2); + LP_WIDTH_CASE(3); + LP_WIDTH_CASE(4); + LP_WIDTH_CASE(5); + LP_WIDTH_CASE(6); + LP_WIDTH_CASE(7); + LP_WIDTH_CASE(8); + LP_WIDTH_CASE(9); + LP_WIDTH_CASE(10); + LP_WIDTH_CASE(11); + LP_WIDTH_CASE(12); + LP_WIDTH_CASE(13); + LP_WIDTH_CASE(14); + LP_WIDTH_CASE(15); + LP_WIDTH_CASE(16); + } + } + + // Otherwise, we have an unhandled width and/or stride. + return false; + +#undef L2_STRIDE_CASE +#undef L2_WIDTH_CASE +#undef LP_STRIDE_CASE +#undef LP_WIDTH_CASE +} + +#include "generic/FeatureLPPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu new file mode 100644 index 0000000..d8b594a --- /dev/null +++ b/aten/src/THCUNN/FusedRNNKernel.cu @@ -0,0 +1,46 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCNumerics.cuh" +#include + +template +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(T* out, T* in) const { + T one = (T) 1.0; + *out = one / (one + THCNumerics::exp(- *in)); + } + + __device__ __forceinline__ void operator()(T* v) const { + T one = (T) 1.0; + *v = one / (one + THCNumerics::exp(- *v)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(half* out, half* in) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); +#else + float fin = ScalarConvert::to(*in); + *out = ScalarConvert::to(1.0f / (1.0f + expf(- fin))); +#endif + } + + __device__ __forceinline__ void operator()(half* v) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); +#else + float fv = ScalarConvert::to(*v); + *v = ScalarConvert::to(1.0f / (1.0f + expf(- fv))); +#endif + } +}; +#endif + +#include "generic/FusedRNNKernel.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu new file mode 100644 index 0000000..aba9f1e --- /dev/null +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -0,0 +1,37 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include 
"THCHalfAutoNumerics.cuh" +#include +#include "common.h" + +template +struct gatedLinearCSigMul_functor +{ + __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const + { + const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert::to(-*sigTensor))); + const Dtype mulNum = *mulTensor; + *target = ScalarConvert::to(sigNum * mulNum); + } +}; + + +template +struct gatedLinearDerivative +{ + const int64_t stride_i_; + const int64_t stride_gI_; + gatedLinearDerivative(int64_t stride_i, int64_t stride_gI) + :stride_i_(stride_i), stride_gI_(stride_gI){} + __device__ void operator()(Dtype * gI, const Dtype * gO, const Dtype * input) const + { + const Dtype * sigTensor = input + stride_i_; + const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert::to(-*sigTensor))); + *gI = ScalarConvert::to(sigNum * *gO); + Dtype * gIsecond = gI + stride_gI_; + *gIsecond = ScalarConvert::to((Acctype(1) - sigNum) * sigNum * *gO * *input); + } +}; + +#include "generic/GatedLinearUnit.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu new file mode 100644 index 0000000..539b22f --- /dev/null +++ b/aten/src/THCUNN/HardTanh.cu @@ -0,0 +1,63 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct hardtanhupdateOutput_functor +{ + const T max_val_; + const T min_val_; + + hardtanhupdateOutput_functor(T min_val, T max_val) + : min_val_(min_val) + , max_val_(max_val) + {} + + __device__ void operator()(T *output, const T *input) const + { + if (*input < min_val_) + *output = min_val_; + else if (*input > max_val_) + *output = max_val_; + else + *output = *input; + } + + __device__ void operator()(T *input) const + { + if (*input < min_val_) + *input = min_val_; + else if (*input > max_val_) + *input = max_val_; + } +}; + +template +struct hardtanhupdateGradInput_functor +{ + const T max_val_; + const T min_val_; + + hardtanhupdateGradInput_functor(T min_val, T max_val) + : min_val_(min_val) + , max_val_(max_val) + {} + + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const + { + if (*input <= min_val_ || *input >= max_val_) + *gradInput = ScalarConvert::to(0); + else + *gradInput = *gradOutput; + } + + __device__ void operator()(T *gradInput, const T *input) const + { + if (*input <= min_val_ || *input >= max_val_) + *gradInput = ScalarConvert::to(0); + } +}; + +#include "generic/HardTanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu new file mode 100644 index 0000000..95bdcd4 --- /dev/null +++ b/aten/src/THCUNN/Im2Col.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/Im2Col.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu new file mode 100644 index 0000000..2422af9 --- /dev/null +++ b/aten/src/THCUNN/IndexLinear.cu @@ -0,0 +1,473 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define divup(a, b) ((a) + (b) - 1) / (b) +const int THREADS_PER_BLOCK = 256; +const int THREADS_X = 32; +const int THREADS_Y = THREADS_PER_BLOCK / THREADS_X; +const int REPEAT = 32; +const int64_t NNZ_PER_BLOCK_MAX = 1024; + +/* sign MACRO */ 
+#ifndef clamp +#define clamp(a, low, high) max(min((a), (high)), (low)) +#endif + +__device__ double atomicExch(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long res = atomicExch(address_as_ull, __double_as_longlong(val)); + return __longlong_as_double(res); +} + +template +__global__ static +void updateOutput( + Ty *output, + Ty *normalizedValues, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t *keys, + const int64_t batchSize, + const int64_t outDim, + Ty *weight, + const Ty *bias, + const int64_t weightStride, + const int64_t keysOffset, + const int maxNormalize, + const int nnzPerBlock) +{ + /******************************************************* + * Adapted from the following file in arrayfire + * https://github.com/arrayfire/arrayfire/blob/v3.4.1/src/backend/opencl/kernel/csrmm.cl + * + ******************************************************* + * Original copyright notice can be seen below: + * + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + + const int64_t tidx = threadIdx.x; + const int64_t tidy = threadIdx.y; + const int64_t tid = tidy * blockDim.x + tidx; + const int64_t gidx = blockIdx.x * blockDim.x + tidx; + + + Ty *nWeight = weight; + // Offset the number of elements specified by maxNormalize + weight += gidx + maxNormalize; + output += gidx; + + bool within_N = (gidx < outDim); + + __shared__ Ty s_values[THREADS_PER_BLOCK]; + __shared__ int64_t s_keys[THREADS_PER_BLOCK]; + + const int64_t rowId = blockIdx.y; + // if (rowId >= batchSize) return; + + // Load the nonzero column offsets for current row + const int64_t batchStart = (rowId == 0 ? 0 : cumSumSizes[rowId - 1]) + blockIdx.z * nnzPerBlock; + const int64_t batchEnd = min(batchStart + nnzPerBlock, cumSumSizes[rowId]); + const int64_t batchStride = blockDim.x * blockDim.y; + + Ty outVal = 0; + // Since the number of nonzero elements might be greater than local memory available, + // Load only part of the row into local memory, perform partial dot, repeat until done. + for (int64_t id = batchStart; id < batchEnd; id += batchStride) { + // Load the current chunk of the row into local memory + int64_t lim = min(batchEnd - id, (int64_t)batchStride); + + int64_t key = tid < lim ? keys[id + tid] + keysOffset : -1; + Ty val = tid < lim ? values[id + tid] : 0; + int64_t nWeightOffset = key * weightStride; + + if (tid < lim && maxNormalize) { + Ty *nWeightCurr = nWeight + nWeightOffset; + if (train) { + Ty absVal = fabs(val); + Ty maxVal = nWeightCurr[0]; + if (absVal > maxVal) { + // Updating maxVal and invMaxVal. Go hogwild! + Ty invAbsVal = 1.0 / absVal; + atomicExch(nWeightCurr + 0, absVal); + atomicExch(nWeightCurr + 1, invAbsVal); + } + val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3]; + normalizedValues[id + tid] = val; + nWeightCurr[2] = 1; + } else { + val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3]; + } + } + + s_keys[tid] = key; + s_values[tid] = val; + __syncthreads(); + + // Perform a single "dot" operation for each thread + for (int64_t idy = tidy; within_N && idy < lim; idy += blockDim.y) { + outVal += s_values[idy] * weight[weightStride * s_keys[idy]]; + } + __syncthreads(); + } + + // s_values is no longer used at this point. Reuse it for reducing outVal. 
+ // A reduction along the y dimension now gives a single output value along x. + s_values[tid] = outVal; + for (int64_t y = blockDim.y / 2; y >= 1; y /= 2) { + __syncthreads(); + if (tidy < y) s_values[tid] = s_values[tid] + s_values[tid + y * blockDim.x]; + } + + if (within_N && tidy == 0) { + Ty val = s_values[tid] + (blockIdx.z == 0 ? bias[gidx] : 0); + if (gridDim.z == 1) { + output[rowId * outDim] = val; + } else { + atomicAdd(output + rowId * outDim, val); + } + } +} + +// This kernel takes in the following inputs: +// values of size [keysSize x 1] and gradOutput of size [batchSize x outDim], +// to generate gradWeight of size [keysSize x outDim] +// nth block along y dimension computes on the non zero elements from the nth batch. +template +__global__ static +void accGradWeight( + Ty *gradWeight, + const Ty *gradOutput, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t outDim, + const int64_t gradWeightStride, + const Ty scale, + const Ty weightDecay, + const int maxNormalize) +{ + const int64_t bidy = blockIdx.y; + const int64_t tidx = threadIdx.x; + const int64_t tidy = threadIdx.y; + const int64_t tid = tidy * blockDim.x + tidx; + const int64_t ntid = blockDim.x * blockDim.y; + const int64_t gidx = blockIdx.x * blockDim.x + tidx; + + // All the y threads in the block will use the same gradOutput value + gradOutput += bidy * outDim; + Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0); + + // Calculate the amount of work for the current block / batch. + const int64_t batchStart = bidy == 0 ? 0 : cumSumSizes[bidy - 1]; + const int64_t batchEnd = cumSumSizes[bidy]; + const int64_t batchLimit = batchEnd - batchStart; + + // Number of iterations required to finish the work for the current batch. + const int64_t iters = divup(batchLimit, ntid); + + // Offset the values to the current batch. + values += batchStart; + + // When maxNormalize is enabled, gradWeight will be twice the size. + // The first half will contain the gradients required for maxNormalization. + // The second half will contain the gradients required for updating weights. + // if maxNormalize is false, both will evaluate to the same pointer. + Ty *gradWeight0 = gradWeight + batchStart * gradWeightStride + gidx; + Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0); + + __shared__ Ty s_values[THREADS_PER_BLOCK]; + + // Using iters to avoid divergence + synchtreads + for (int64_t n = 0; n < iters; n++) { + int64_t off = n * ntid; + int64_t id = off + tid; + int64_t lim = min(ntid, batchLimit - off); + + // Read the values required for the current iteration. + s_values[tid] = id < batchLimit ? values[id] : 0; + __syncthreads(); + + if (gidx < outDim) { + if (maxNormalize) { + for (int64_t idy = tidy; idy < lim; idy += blockDim.y) { + // gradOutVal is already scaled + gradWeight0[(off + idy) * gradWeightStride] = gradOutVal; + } + } + + for (int64_t idy = tidy; idy < lim; idy += blockDim.y) { + gradWeight1[(off + idy) * gradWeightStride] = s_values[idy] * gradOutVal; + } + } + __syncthreads(); + } +} + +// The gradBias is just a reduction of gradOutput along the batches. +// There is only one block along y dimension performing the reduction. 
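+// In other words, for each output index j the kernel computes
+//   gradBias[j] = scale * sum_{b = 0 .. batchSize-1} gradOutput[b * outDim + j]
+// via one partial sum per thread along y followed by a shared-memory tree
+// reduction; when update is true the result is instead subtracted from bias
+// in place.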
+template +__global__ static +void accGradBias( + Ty *buffer, + const Ty *gradOutput, + const int64_t outDim, + const int64_t batchSize, + const Ty scale, + const Ty weightDecay) +{ + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * blockDim.x + tidx; + const int64_t idx = blockIdx.x * blockDim.x + tidx; + + + Ty gradBiasVal = 0; + gradOutput += idx; + __shared__ Ty s_gradBiasVals[THREADS_PER_BLOCK]; + + // Each thread along y calculates the partial sum. + if (idx < outDim) { + for (int64_t idy = tidy; idy < batchSize; idy += blockDim.y) { + gradBiasVal += gradOutput[idy * outDim]; + } + } + s_gradBiasVals[tid] = gradBiasVal * scale; + __syncthreads(); + + // Perform reduction is performed along y. + for (int y = blockDim.y / 2; y >= 1; y /= 2) { + if (tidy < y) { + s_gradBiasVals[tid] += s_gradBiasVals[tid + y * blockDim.x]; + } + __syncthreads(); + } + + // Write the output only from the first lane. + if (tidy == 0 && idx < outDim) { + if (update) { + // If performing inplace update, subtract from bias. + Ty *bias = buffer; + bias[idx] = (bias[idx] - s_gradBiasVals[tid]); + } else { + // If just accumulating gradients, write to gradBias. + Ty *gradBias = buffer; + gradBias[idx] = s_gradBiasVals[tid]; + } + } +} + +// Use gradWeight from accGradWeight to update the weight. +// This kernel is launched batchSize number of times. +// At each step in the iteration, the weights are updated in a sparse manner. +template +__global__ static +void updateWeight( + Ty *weight, + const Ty *gradWeight, + const int64_t *keys, + const int64_t *cumSumSizes, + const int64_t outDim, + const int64_t gradWeightStride, + const int64_t weightStride, + const int64_t keysOffset, + const Ty learningRate, + const Ty weightDecay, + const int maxNormalize, + const int64_t batchId) +{ + int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y; + + // Find the limits of the work to be done + const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1]; + const int64_t batchEnd = cumSumSizes[batchId]; + + // When maxNormalize is turned on, the weight tensor will contain + // an extra "maxNormalize" number of terms per output at the beginning. + // When maxNormalize is false, both will evaluate to same pointer. + // when maxNormalize is true, + // - nWeight[2] will contain the individual scaling factor. + // - nWeight[3] will contain the individual bias for the normalized input. + Ty *nWeight = weight; + weight += maxNormalize + gidx; + + // When maxNormalize is enabled, gradWeight will be twice the size. + // The first half will contain the gradients required for maxNormalization. + // The second half will contain the gradients required for updating weights. + // if maxNormalize is false, both will evaluate to the same pointer. + const Ty *gradWeight0 = gradWeight + gidx; + const Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0); + + if (gidx >= outDim) return; + for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) { + Ty lr = learningRate; + Ty wd = weightDecay; + int64_t weightOffset = (keys[id] + keysOffset) * weightStride; + Ty weightVal = weight[weightOffset]; + + if (maxNormalize) { + Ty scale = nWeight[weightOffset + 2]; + lr *= scale; + wd *= scale; + // nWeight[3] needs to be updated in the following manner for a given input. + // nWeight[3] = nWeight[3] - sum(gradWeight0[gidx] * weight[gidx]); + // Since problem is parallelized along gidx, use atomicAdd for the update. 
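+      // (Each gidx thread contributes its own -gradNormBias term below, and
+      // atomicAdd accumulates these per-thread terms into nWeight[3].)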
+ Ty gradNormBias = lr * weightVal * gradWeight0[id * gradWeightStride]; + atomicAdd(nWeight + weightOffset + 3, -gradNormBias); + } + + // Perform the regular update + Ty gradWeightVal = lr * gradWeight1[id * gradWeightStride]; + if (weightDecay == 0) { + weight[weightOffset] = weightVal - gradWeightVal; + } else { + weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal; + } + } +} + +// This kernel is launched batchSize number of times. +// At each step in the iteration, the weights are updated in place in a sparse manner. +template +__global__ static +void accUpdateWeight( + Ty *weight, + const int64_t weightStride, + const Ty *gradOutput, + const int64_t outDim, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t *keys, + const int64_t keysOffset, + const Ty scale, + const Ty weightDecay, + const int maxNormalize, + const int64_t batchId) +{ + // Parallel along outDim. + int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x; + // Parallel along the sparse input size for current batch. + int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y; + + if (gidx >= outDim) return; + + // Find the limits of the work to be done. + const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1]; + const int64_t batchEnd = cumSumSizes[batchId]; + + gradOutput += batchId * outDim; + Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0); + + // When maxNormalize is turned on, the weight tensor will contain + // an extra "maxNormalize" number of terms per output at the beginning. + // When maxNormalize is false, both will evaluate to same pointer. + // when maxNormalize is true, + // - nWeight[2] will contain the individual scaling factor. + // - nWeight[3] will contain the individual bias for the normalized input. + Ty *nWeight = weight; + weight += maxNormalize + gidx; + + for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) { + Ty wd = weightDecay; + int64_t weightOffset = (keys[id] + keysOffset) * weightStride; + Ty gradWeightVal = gradOutVal * values[id]; + Ty weightVal = weight[weightOffset]; + + if (maxNormalize) { + Ty nScale = nWeight[weightOffset + 2]; + gradWeightVal *= nScale; + wd *= nScale; + // nWeight[3] needs to be updated in the following manner for a given input. + // nWeight[3] = nWeight[3] - sum(gradOut[gidx] * weight[gidx]); + // Since problem is parallelized along gidx, use atomicAdd for the update. 
+ Ty gradNormBias = nScale * weightVal * gradOutVal; + atomicAdd(nWeight + weightOffset + 3, -gradNormBias); + } + + // Perform the regular update + if (weightDecay == 0) { + weight[weightOffset] = weightVal - gradWeightVal; + } else { + weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal; + } + } +} + + +#ifdef CUDA_HALF_TENSOR +void THNN_CudaHalfIndexLinear_updateOutput( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor *normalizedValues, + int train) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_accGradParameters( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor* valuesBuffer, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_accUpdateGradParameters( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_updateParameters( + THCState *state, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaLongTensor *runningKeys, + THCudaLongTensor *cumSumSizes, + int64_t keysOffset, + float weightDecay, + float learningRate) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} +#endif + +#include "generic/IndexLinear.cu" +#include "THCGenerateFloatType.h" +#include "generic/IndexLinear.cu" +#include "THCGenerateDoubleType.h" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu new file mode 100644 index 0000000..eda58c1 --- /dev/null +++ b/aten/src/THCUNN/L1Cost.cu @@ -0,0 +1,34 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include +#include +#include + +template +struct l1cost_functor +{ + __host__ __device__ Acctype operator()(Dtype x) const + { + return THCNumerics::abs(ScalarConvert::to(x)); + } +}; + +template +struct l1cost_updateGradInput_functor +{ + __host__ __device__ Dtype operator()(Dtype x) const + { + if (x > 0) + return ScalarConvert::to(1); + else if (x < 0) + return ScalarConvert::to(-1); + else + return ScalarConvert::to(0); + } +}; + +#include "generic/L1Cost.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu new file mode 100644 index 0000000..ec9efb8 --- /dev/null +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -0,0 +1,74 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct LeakyReLUUpdateOutput +{ + const T negval_; + + LeakyReLUUpdateOutput(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > 0) ? 
x : x * negval_; + } +}; + +// in-place variant +template +struct LeakyReLUUpdateOutputIP +{ + const T negval_; + + LeakyReLUUpdateOutputIP(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()(T *x) + { + *x = (*x > 0) ? *x : negval_ * (*x); + } +}; + +template +struct LeakyReLUUpdateGradInput +{ + const T negval_; + + LeakyReLUUpdateGradInput(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()( + T* gradInput, + T* input, + T* gradOutput) const + { + *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_; + } +}; + +template +struct LeakyReLUUpdateGradInputIP +{ + const T negval_; + + LeakyReLUUpdateGradInputIP(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()( + T* gradOutput, + T* input) const + { + *gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_; + } +}; + +#include "generic/LeakyReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu new file mode 100644 index 0000000..357b7bf --- /dev/null +++ b/aten/src/THCUNN/LogSigmoid.cu @@ -0,0 +1,98 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +#if defined(_MSC_VER) || defined(__HIP_PLATFORM_HCC__) +#define ZERO_MACRO zero() +template +inline __device__ typename std::enable_if::value, T>::type zero() { + return 0.; +} + +template +inline __device__ typename std::enable_if::value, T>::type zero() { + return 0.f; +} +#else +#define ZERO_MACRO 0.f +#endif + +template +struct logSigmoid_updateOutput_functor +{ + __device__ void operator()(T *output, const T *input) const { + const T max = fmaxType(ZERO_MACRO, -*input); + const T z = THCNumerics::exp(-max) + THCNumerics::exp(-*input -max); + *output = -(max + THCNumerics::log(z)); + } +}; + + +template +struct logSigmoid_updateGradInput_functor +{ + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const { + const T max = fmaxType(ZERO_MACRO, -*input); + const T z = THCNumerics::exp(-max) + THCNumerics::exp(-*input -max); + T max_deriv = 0.f; + T sign = -1.f; + if (*input < 0.f){ + max_deriv = -1.f; + sign = 1.f; + } + *gradInput = *gradOutput * (-max_deriv - sign*((z - 1.f)/z)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct logSigmoid_updateOutput_functor { + __device__ __forceinline__ void operator()(half* output, const half *input) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half max = fmaxType(__float2half(0.f), __hneg(*input)); + const half z = THCNumerics::exp(__hneg(max)) + THCNumerics::exp(__hneg(*input) - max); + *output = __hneg(max + THCNumerics::log(z)); +#else + float in = __half2float(*input); + float max = fmaxType(0.f, -in); + float z = THCNumerics::exp(-max) + THCNumerics::exp(-in - max); + *output = __float2half(-(max + THCNumerics::log(z))); +#endif + } +}; + +template <> +struct logSigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(half* gradInput, const half *input, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + const half zero = __float2half(0.f); + const half max = fmaxType(zero, __hneg(*input)); + const half z = THCNumerics::exp(__hneg(max)) + THCNumerics::exp(__hneg(*input) - max); + half max_deriv = zero; + half sign = __hneg(one); + if(*input < zero){ + max_deriv = __hneg(one); + sign = one; + } + *gradInput = __hmul(*gradOutput, (__hneg(max_deriv) - __hmul(sign, __hdiv(z - one, z)))); +#else + const float in = 
__half2float(*input); + const float max = fmaxType(0.f, -in); + const float z = THCNumerics::exp(-max) + THCNumerics::exp(-in - max); + const float go = __half2float(*gradOutput); + float max_deriv = 0.f; + float sign = -1.f; + if(in < 0.f){ + max_deriv = -1.f; + sign = 1.f; + } + *gradInput = __float2half(go * (-max_deriv - sign*((z - 1.f)/z))); +#endif + } +}; +#endif + +#include "generic/LogSigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu new file mode 100644 index 0000000..59aa7e8 --- /dev/null +++ b/aten/src/THCUNN/LookupTable.cu @@ -0,0 +1,227 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCThrustAllocator.cuh" +#include +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensorSort.cuh" +#include "../THC/THCTensorMathReduce.cuh" + +const int WARP_SIZE = 32; + +template + +__global__ void cunn_LookupTable_accGradParametersKernelByFeature + (int64_t *indices, + Dtype *grad, + Dtype *grad_weight, + Dtype scale, + ptrdiff_t n, + int64_t stride, + int padding_idx) +{ + extern __shared__ char buf[]; + Acctype* smem = (Acctype*)buf; + Acctype* my_s = smem + WARP_SIZE*threadIdx.y; + int* indices_batch = (int*)(buf + sizeof(Acctype)*WARP_SIZE*blockDim.y); + + const int s = (int)stride; // OK to make int, we don't expect 2 billion+ embedding row size + + const int f = threadIdx.x + blockIdx.x*blockDim.x; // feature_dim + + for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y) + { + // Entire block cooperates to load a batch of 1024 indices to process + int tid = threadIdx.x + threadIdx.y*blockDim.x; + if(batch_start + tid < n) + indices_batch[tid] = (int)(indices[batch_start + tid] - TH_INDEX_BASE); + + // Loop over the batch of <= 1024 loaded indices in chunks of blockDim.y = 32 + for(int chunk_start = batch_start; chunk_start < n; chunk_start += blockDim.y) + { + // This does double duty: it makes sure indices_batch is ready, and it makes sure match-group + // leaders are done with their accumulates before other warps start loading again. + __syncthreads(); + + int n_this_chunk = (n - chunk_start) < blockDim.y ? (n - chunk_start) : blockDim.y; + + int src_row = chunk_start + threadIdx.y; + int dst_row = indices_batch[src_row - batch_start]; // This warp's target row in grad_weight + + // All warps load their smem segments with incoming grad data + if(src_row < n && f < s && dst_row != padding_idx - TH_INDEX_BASE) + my_s[threadIdx.x] = ScalarConvert::to(scale*grad[src_row*stride + f]); + + __syncthreads(); + + // To ensure determinism, we can't just have each warp add its grad data to its dst_row. + // We need to check if any other warps pulled grad data targeting dst_row. + // If so, we elect the first warp in each matching group as the leader. + // Each leader warp serializes the accumulates targeting dst_row in shared memory, + // then finishes by adding the accumulated buffer to dst_row in grad_weight. 
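+      // Illustrative walk-through (hypothetical indices): suppose warps
+      // y = 1, 3 and 5 of this chunk all target dst_row 7. The ballot below
+      // then sets bits 1, 3 and 5 of matchmask; warp 1 (the first set bit)
+      // becomes the leader, serially adds warps 3 and 5's shared-memory
+      // segments into its own, and performs the single update of grad_weight,
+      // so the accumulation order is fixed and the result is deterministic.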
+ if(dst_row != padding_idx - TH_INDEX_BASE && src_row < n) // Per-warp exit condition + { + int match_found_this_thread = + (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]); + if(threadIdx.x >= n_this_chunk) + match_found_this_thread = 0; + unsigned int matchmask = WARP_BALLOT(match_found_this_thread); + + int first_remaining_peer = __ffs(matchmask) - 1; + + if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader + { + matchmask ^= (1 << first_remaining_peer); + while(matchmask) + { + first_remaining_peer = __ffs(matchmask) - 1; + my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer]; + matchmask ^= (1 << first_remaining_peer); + } + if(f < s) + grad_weight[dst_row*stride + f] += ScalarConvert::to(my_s[threadIdx.x]); + } + } + } + } +} + +template +__global__ void cunn_LookupTable_accGradParametersKernel( + int64_t *input, int64_t *indices, Dtype *gradOutput, Dtype *gradWeight, + int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride, int paddingValue) { + + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. + // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1]) + && input[idx] != paddingValue) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride; + const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride; + const Acctype scale = count ? 
ScalarConvert::to(defaultScale) / count[idx] : ScalarConvert::to(defaultScale); + + Acctype gradient[SZ]; + Acctype weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradient[ii] = ScalarConvert::to(gradOutput[gradOutputRow + featureDim]); + weight[ii] = ScalarConvert::to(gradWeight[weightRow + featureDim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradWeight[weightRow + featureDim] = ScalarConvert::to(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType norm) { + AccType xA = ScalarConvert::to(x); + return std::pow(std::abs(xA), norm); + } +}; + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType _) { + AccType xA = ScalarConvert::to(x); + return std::abs(xA); + } +}; + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType _) { + AccType xA = ScalarConvert::to(x); + return xA * xA; + } +}; + +/* Calculate norms of the rows of weight_ptr given by idx_ptr and capture them in norms */ +template +__global__ +void calculate_norms_and_renorm(DType *weights, + THCIndex_t *indices, + AccType normType, + AccType maxNorm, + IndexType dim) +{ + // Some casting hacks since dynamic shared memory and templates don't work together: + extern __shared__ unsigned char smem[]; + AccType *sdata = reinterpret_cast(smem); + + IndexType tid = threadIdx.x; + IndexType baseIndex = (indices[blockIdx.x] - TH_INDEX_BASE) * dim; + + AccType accZero = ScalarConvert::to(0); + AccType v = accZero; + for (IndexType i = tid; i < dim; i += blockDim.x) { + v += FastPow::pow(weights[baseIndex + i], normType); + } + + v = reduceBlock> + (sdata, blockDim.x, v, ReduceAdd(), accZero); + + if (tid == 0) { + sdata[0] = std::pow(v, + THCNumerics::div(ScalarConvert::to(1), normType) + ); + } + __syncthreads(); + // now we renormalize the blocks that need it + if (sdata[0] > maxNorm) { + DType factor = ScalarConvert::to(maxNorm / (sdata[0] + 1e-7)); + for (IndexType i = tid; i < dim; i += blockDim.x) { + weights[baseIndex + i] *= factor; + } + } + +} + +#include "generic/LookupTable.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu new file mode 100644 index 0000000..c2ba9f5 --- /dev/null +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -0,0 +1,143 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "THCThrustAllocator.cuh" +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif +#include +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensorSort.cuh" + +const int WARP_SIZE = 32; +const int MODE_SUM = 0; +const int MODE_MEAN = 1; + +template +__global__ void cunn_LookupTableBag_updateOutputKernel( + int64_t *input, int64_t *offsets, Dtype *weight, Dtype *output, + int64_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t stride, int mode, + int64_t *bag_size) { + + // the strategy here is that each bag x feature is handled by a single thread + + int64_t chunksPerBag = THCCeilDiv(stride, (int64_t) blockDim.x); + int64_t numChunks = numBags * chunksPerBag; 
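+  // Illustrative example (hypothetical sizes): with an embedding width of
+  // stride = 100 and blockDim.x = 32, chunksPerBag = ceil(100 / 32) = 4, so
+  // each bag is covered by 4 chunks of up to 32 features; chunk c maps to
+  // bag = c / chunksPerBag and featureDim = (c % chunksPerBag) * 32 + threadIdx.x.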
+ int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < stride) { + int64_t bag = chunk / chunksPerBag; + Dtype* weightFeat = weight + featureDim; + int64_t begin = offsets[bag] - TH_INDEX_BASE; + int64_t end = (bag < numBags - 1) ? (offsets[bag + 1] - TH_INDEX_BASE) : numIndices; + assert(end >= begin); + Acctype weightFeatSum = ScalarConvert::to(0); + int64_t bag_size_ = 0; + for (int64_t emb = begin; emb < end; emb++) { + const int weightRow = ((int) input[emb] - TH_INDEX_BASE) * stride; + weightFeatSum += ScalarConvert::to(weightFeat[weightRow]); + bag_size_ ++; + if (featureDim == 0) { + offset2bag[emb] = bag + TH_INDEX_BASE; + } + } + if (mode == MODE_MEAN) { + weightFeatSum = weightFeatSum / ScalarConvert::to(bag_size_); + bag_size[bag] = bag_size_; + } + (void) MODE_SUM; //silence warnings about unused MODE_SUM; + output[bag * stride + featureDim] = ScalarConvert::to(weightFeatSum); + } + } +} + +// FIXME: removed the accGradParametersKernelByFeature case present in +// LookupTable. That kernel is faster at small sizes (<768 indices), which +// does not need LookupTableBag (LookupTable + Sum works fine), but would +// still be nice to not be slow in that case. + +template +__global__ void cunn_LookupTableBag_accGradParametersKernel( + int64_t *input, int64_t *indices, Dtype *gradOutput, Dtype *gradWeight, int64_t *offset2bag, + int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride, + int mode, int64_t *bag_size) { + + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. + // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1])) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride; + + // Note: only this line changes from LookupTable_accgradParametersKernel + const int origRow = ((int) indices[idx] - TH_INDEX_BASE); + const int seq_number = offset2bag[origRow] - TH_INDEX_BASE; + const int gradOutputRow = ((int) seq_number) * stride; + + const Acctype scale = count ? 
ScalarConvert::to(defaultScale) / count[idx] : ScalarConvert::to(defaultScale); + + Acctype gradient[SZ]; + Acctype weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradient[ii] = ScalarConvert::to(gradOutput[gradOutputRow + featureDim]); + if (mode == MODE_MEAN) { + gradient[ii] /= bag_size[seq_number]; + } + weight[ii] = ScalarConvert::to(gradWeight[weightRow + featureDim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradWeight[weightRow + featureDim] = ScalarConvert::to(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + + +#include "generic/LookupTableBag.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu new file mode 100644 index 0000000..e9571fe --- /dev/null +++ b/aten/src/THCUNN/MSECriterion.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct mse_functor +{ + mse_functor() {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = ScalarConvert::to(x)-y; + return z*z; + } +}; + + +template +struct mse_updateOutput_functor +{ + mse_updateOutput_functor() {} + + __device__ void operator()( + const Dtype *input, + const Dtype *target, + Dtype *output) + { + Dtype diff = THCNumerics::sub(*input, *target); + *output = THCNumerics::mul(diff, diff); + } +}; + + +template +struct mse_updateGradInput_functor +{ + const Acctype norm; + + mse_updateGradInput_functor(Acctype norm_) + : norm(norm_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + return ScalarConvert::to(norm * (ScalarConvert::to(x) - y)); + } +}; + +#include "generic/MSECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu new file mode 100644 index 0000000..7ccdbb7 --- /dev/null +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -0,0 +1,45 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include +#include +#include +#include +#include + +template +struct margin_functor +{ + margin_functor(Acctype margin) + : margin(margin) + {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = margin - ScalarConvert::to(x) * y; + return z >= 0 ? z : 0; + } + + const Acctype margin; +}; + +template +struct margin_updateGradInput_functor +{ + const Acctype margin, norm; + + margin_updateGradInput_functor(Acctype margin_, Acctype norm_) + : margin(margin_) + , norm(norm_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + return ScalarConvert::to((ScalarConvert::to(x) * y) < margin ? 
-norm * y : 0); + } +}; + +#include "generic/MarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu new file mode 100644 index 0000000..13b432c --- /dev/null +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -0,0 +1,152 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include + +#define MULTILABELMARGIN_THREADS 1024 + +template +__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output, + Dtype *input, + THCIndex_t *target, + Dtype *istarget, + int nframe, + int dim, + int sizeaverage) +{ + // Temporary sums (for mapreduce) + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; + + // vectors: + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *output_k = output + k; + Dtype *istarget_k = istarget + k*dim; + + // zero istarget + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + istarget_k[d] = ScalarConvert::to(0); + } + __syncthreads(); + + // mark targets in istarget + if (threadIdx.x == 0) { + for (int dt = 0; dt < dim; dt++) { + int target_idx = target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + istarget_k[target_idx] = ScalarConvert::to(1); + } + } + __syncthreads(); + + // iterate over targets + Acctype sum = 0; + for (int dt = 0; dt < dim; dt++) { + // next target: + int target_idx = target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + + // current value for target + Dtype input_target_k = input_k[target_idx]; + + // compare to all inputs (multithreaded): + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + // contribute to loss only if not a target + if (!ScalarConvert::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; + if (z > 0) + sum += z; + } + } + } + + // reduce + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus(), (Acctype)0); + if (threadIdx.x == 0) { + if (sizeaverage) { + *output_k = ScalarConvert::to((totalSum / dim) / nframe); + } else { + *output_k = ScalarConvert::to(totalSum / dim); + } + } +} + +template +__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput, + Dtype *gradOutput, + Dtype *input, + THCIndex_t *target, + Dtype *istarget, + int nframe, + int dim, + int sizeaverage, + int reduce) +{ + // Temporary sums (for mapreduce) + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; + + // vectors: + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *istarget_k = istarget + k*dim; + + Dtype *gradOutput_k = gradOutput; + if (!reduce) { + gradOutput_k += k; + } + + // gain: + Dtype g = ScalarConvert::to( sizeaverage && reduce ? 
1./((Acctype)(nframe*dim)) : 1./((Acctype)dim) ); + + // zero gradients: + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + gradInput_k[d] = ScalarConvert::to(0); + } + __syncthreads(); + + // iterate over targets + for (int dt = 0; dt < dim; dt++) { + // next target: + int target_idx = (int)target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + + // current value for target + Dtype input_target_k = input_k[target_idx]; + + // compare to all inputs (multithreaded): + Acctype sum = 0; + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + // contribute to loss only if not a target + if (!ScalarConvert::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; + if (z > 0) { + sum -= g; + gradInput_k[d] += g; + } + } + } + __syncthreads(); + + // reduce sum + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus(), (Acctype)0); + if (threadIdx.x == 0) { + gradInput_k[target_idx] += ScalarConvert::to(totalSum); + } + } + + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + gradInput_k[d] *= *gradOutput_k; + } +} + +#include "generic/MultiLabelMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" + +#undef MULTILABELMARGIN_THREADS diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu new file mode 100644 index 0000000..c2fa213 --- /dev/null +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -0,0 +1,122 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define MULTIMARGIN_THREADS 128 + +template +__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) +{ + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *output_k = output + k; + int target_k = ((int)target[k]) - TH_INDEX_BASE; + Dtype input_target_k = input_k[target_k]; + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i = i_start; i < i_end; i += i_step) + { + Dtype z = margin - input_target_k + input_k[i]; + if (i == target_k) + continue; + + if (z > 0) { + Dtype h = (P==1) ? z : z*z; + if(weights) + h *= weights[target_k]; + buffer[threadIdx.x] += h; + } + } + __syncthreads(); + + // reduce + if (threadIdx.x == 0) + { + Acctype sum = 0; + for (int i=0; i < blockDim.x; i++) + sum += buffer[i]; + + *output_k = ScalarConvert::to(sum/dim); + if(sizeAverage) + *output_k /= nframe; + } +} + +template +__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, + Dtype *gradOutput, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int nframe, + int dim, + bool sizeAverage, + Dtype margin, + int reduce) +{ + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; + int target_k = ((int)target[k]) - TH_INDEX_BASE; + Dtype input_target_k = input_k[target_k]; + + Dtype *gradOutput_k = gradOutput; + if (!reduce) { + gradOutput_k += k; + } + + Acctype g = (sizeAverage && reduce ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim)); + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i=i_start; i 0) + { + Dtype h = ScalarConvert::to((P == 1) ? 
g : 2*g*z); + if(weights) + h *= weights[target_k]; + buffer[threadIdx.x] -= h; + gradInput_k[i] = h; + } + else + gradInput_k[i] = ScalarConvert::to(0); + } + + __syncthreads(); + + // reduce + if (threadIdx.x == 0) + { + Acctype gradInput_target_k = 0; + for (int i=0; i::to(gradInput_target_k); + } + + for (int i=i_start; i +#include "THCTensor.hpp" + +#include "common.h" + +template +struct PReLUUpdateOutput +{ + T* weight_; + + PReLUUpdateOutput(T* weight) + : weight_(weight) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > 0) ? x : weight_[0] * x; + } +}; + +template +__global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize) +{ + CUDA_KERNEL_LOOP(i, n) + { + int positionInSample = i % nElemsPerSample; + int mapNumber = positionInSample / mapSize; + output[i] = input[i] > 0 ? input[i] : input[i] * weight[mapNumber]; + } +} + +template +struct PReLUUpdateGradInput +{ + T *weight_; + + PReLUUpdateGradInput(T *weight) + : weight_(weight) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input) + { + *gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_; + } +}; + +template +__global__ void preluBackward( + T *gradInput, + const T *input, + const T *weight, + const T *gradOutput, + int n, int nElemsPerSample, int mapSize) +{ + CUDA_KERNEL_LOOP(i, n) + { + int positionInSample = i % nElemsPerSample; + int mapNumber = positionInSample / mapSize; + gradInput[i] = input[i] > 0 ? gradOutput[i] : gradOutput[i] * weight[mapNumber]; + } +} + +template +struct PReLUAccGradParametersShared +{ + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) + { + *gradInput = (*input) * (*gradOutput) * (*input <= 0); + } +}; + +template +struct PReLUAccGradParameters +{ + T scale; + + PReLUAccGradParameters(T scale) + : scale(scale) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) + { + *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + } +}; + +template +struct PReLUAccGradParameters1to1 +{ + T scale; + + PReLUAccGradParameters1to1(T scale) + : scale(scale) + {} + + __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) + { + *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + } +}; + +#include "generic/PReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu new file mode 100644 index 0000000..bf45035 --- /dev/null +++ b/aten/src/THCUNN/RReLU.cu @@ -0,0 +1,124 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include +#include "common.h" +#include +#include + +// copied from cutorch/lib/THC/THCTensorRandom.cu +#define MAX_NUM_BLOCKS 64 +#define BLOCK_SIZE 256 +#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS) + +template +inline T __device__ curand_uniform_type(curandStateMtgp32 *state); + +#ifdef CUDA_HALF_TENSOR +template <> +inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); +} +#endif + +template <> +inline float __device__ curand_uniform_type(curandStateMtgp32 *state) { + return curand_uniform(state); +} + +template <> +inline double __device__ curand_uniform_type(curandStateMtgp32 *state) { + return curand_uniform_double(state); +} + +template +__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state, + T *input, T* noise, T 
*output, double a, double b) +{ + CUDA_KERNEL_LOOP(i, n) + { + if (input[i] <= 0) + { + T r = curand_uniform_type(&state[blockIdx.x]); + r = ScalarConvert::to(r * (b-a) + a); + output[i] = input[i] * r; + noise[i] = r; + } + else + { + output[i] = input[i]; + noise[i] = ScalarConvert::to(1); + } + } +} + +template +struct RReLUUpdateOutputEval_functor +{ + const T negSlope_; + + RReLUUpdateOutputEval_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + const T x = *in; + const T r = x <= 0 ? negSlope_ : ScalarConvert::to(1); + *out = x * r; + } +}; + +template +struct RReLUUpdateOutputEvalIP_functor +{ + const T negSlope_; + + RReLUUpdateOutputEvalIP_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *x) + { + if (*x <= 0) + { + *x = *x * negSlope_; + } + } +}; + +template +struct RReLUupdateGradInputEval_functor +{ + const T negSlope_; + + RReLUupdateGradInputEval_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in) + { + *gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut); + } +}; + +template +struct RReLUupdateGradInputEvalIP_functor +{ + const T negSlope_; + + RReLUupdateGradInputEvalIP_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *gradOut, T *in) + { + if (*in <= 0) + { + *gradOut = (*gradOut) * negSlope_; + } + } +}; + +#include "generic/RReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SharedMem.cuh b/aten/src/THCUNN/SharedMem.cuh new file mode 100644 index 0000000..070d269 --- /dev/null +++ b/aten/src/THCUNN/SharedMem.cuh @@ -0,0 +1,45 @@ +// Based on the simpleTempltes CUDA example + +#ifndef THCUNN_SHAREDMEM_H +#define THCUNN_SHAREDMEM_H + +template +struct SharedMem { + __device__ T *getPointer() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct SharedMem +{ + __device__ half *getPointer() { + extern __shared__ half s_half[]; + return s_half; + } +}; +#endif + +template <> +struct SharedMem +{ + __device__ float *getPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMem +{ + __device__ double *getPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +#endif diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu new file mode 100644 index 0000000..85bda93 --- /dev/null +++ b/aten/src/THCUNN/Sigmoid.cu @@ -0,0 +1,30 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct sigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(T* gradInput, const T *output, const T *gradOutput) const { + *gradInput = *gradOutput * (1.f - *output) * (*output); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct sigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(half* gradInput, const half *output, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + *gradInput = __hmul(*gradOutput, __hmul(__hadd(one, __hneg(*output)), *output)); +#else + const float out = __half2float(*output); + const float go = __half2float(*gradOutput); + *gradInput = __float2half(go * (1.f - out) * out); +#endif + } +}; +#endif + +#include "generic/Sigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git 
a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu new file mode 100644 index 0000000..c8018d9 --- /dev/null +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -0,0 +1,91 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct smoothl1_functor +{ + smoothl1_functor() {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = ScalarConvert::to(THCNumerics::abs(x-y)); + return z < Acctype(1) ? 0.5f*z*z : z - 0.5f; + } +}; + +template +struct smoothl1_updateOutput_no_reduce_functor +{ + smoothl1_updateOutput_no_reduce_functor() {} + + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *out) const + { + Dtype oneHalf = ScalarConvert::to(0.5f); + Dtype z = THCNumerics::abs(*x - *y); + *out = z < ScalarConvert::to(1) ? oneHalf * z * z : z - oneHalf; + } +}; + +template +struct smoothl1_updateGradInput_no_reduce_functor +{ + smoothl1_updateGradInput_no_reduce_functor() {} + + __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *gradInput) const + { + Dtype z = *x - *y; + Dtype one = ScalarConvert::to(1); + Dtype minusOne = ScalarConvert::to(-1); + if (z < minusOne) { + *gradInput = minusOne; + } else if (z > one) { + *gradInput = one; + } else { + *gradInput = z; + } + } +}; + +template +struct smoothl1_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + smoothl1_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + Dtype z = x - y; + if (z < ScalarConvert::to(-1)) + return -norm * gradOutput; + else if (z > ScalarConvert::to(1)) + return norm * gradOutput; + else + return norm * z * gradOutput; + } +}; + +#include "generic/SmoothL1Criterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu new file mode 100644 index 0000000..ee53e76 --- /dev/null +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -0,0 +1,65 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct softmargin_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + return log(1 + exp(ScalarConvert::to(-x)*y)); + } +}; + +template +struct softmargin_no_reduce_functor +{ + __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *out) const + { + *out = ScalarConvert::to(log(ScalarConvert::to(1) + + exp(ScalarConvert::to(-*x) * *y))); + } +}; + +template +struct softmargin_updateGradInput_functor +{ + const Acctype norm; + const Dtype gradOutput; + + softmargin_updateGradInput_functor(Acctype norm_, Dtype gradOutput_) : + norm(norm_), gradOutput(gradOutput_) {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + Acctype temp = exp(ScalarConvert::to(-x)*y); + return ScalarConvert::to(-y*temp*norm/(ScalarConvert::to(1) + temp) * gradOutput); + } +}; + +template +struct softmargin_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + 
Dtype *gradInput) const + { + Acctype temp = exp(ScalarConvert::to(-*x) * *y); + *gradInput = ScalarConvert::to(-*y * temp / (ScalarConvert::to(1) + temp)); + } +}; + +#include "generic/SoftMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu new file mode 100644 index 0000000..42b2c3c --- /dev/null +++ b/aten/src/THCUNN/SoftPlus.cu @@ -0,0 +1,43 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct softPlusupdateOutput_functor +{ + const T threshold; + const T beta; + + softPlusupdateOutput_functor(T threshold_, T beta_) + : threshold(threshold_) + , beta(beta_) + {} + + __device__ void operator()(T *output, const T *input) const { + T betain = beta * (*input); + *output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain)); + } +}; + +template +struct softPlusupdateGradInput_functor +{ + const T threshold; + const T beta; + + softPlusupdateGradInput_functor(T threshold_, T beta_) + : threshold(threshold_) + , beta(beta_) + {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + T betaout = beta * (*output); + T exp_bo = exp(betaout); + *gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo; + } +}; + +#include "generic/SoftPlus.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu new file mode 100644 index 0000000..a4e45d8 --- /dev/null +++ b/aten/src/THCUNN/SoftShrink.cu @@ -0,0 +1,44 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct SoftShrinkUpdateOutput +{ + const T lambda_; + + SoftShrinkUpdateOutput(T lambda) + : lambda_(lambda) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + if (x > lambda_) *out = x - lambda_; + else if (x < -lambda_) *out = x + lambda_; + else *out = ScalarConvert::to(0); + } +}; + +template +struct SoftShrinkUpdateGradInput +{ + const T lambda_; + + SoftShrinkUpdateGradInput(T lambda) + : lambda_(lambda) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const + { + T x = *input; + if (x > lambda_ || x < -lambda_) + *gradInput = *gradOutput; + else + *gradInput = ScalarConvert::to(0); + } +}; + +#include "generic/SoftShrink.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu new file mode 100644 index 0000000..cd9b659 --- /dev/null +++ b/aten/src/THCUNN/SparseLinear.cu @@ -0,0 +1,87 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include + +static cusparseHandle_t cusparse_handle = 0; + +static void init_cusparse() { + if (cusparse_handle == 0) { + cusparseStatus_t status = cusparseCreate(&cusparse_handle); + if (status != CUSPARSE_STATUS_SUCCESS) { + THError("CUSPARSE Library initialization failed"); + } + } +} + +#ifdef CUDA_HALF_TENSOR +void THNN_CudaHalfSparseLinear_updateOutput( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_accGradParameters( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, 
+ float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_legacyUpdateOutput( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_legacyAccGradParameters( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_zeroGradParameters( + THCState *state, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_updateParameters( + THCState *state, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput, + float learningRate) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} +#endif + +#include "generic/SparseLinear.cu" +#include "THCGenerateFloatType.h" +#include "generic/SparseLinear.cu" +#include "THCGenerateDoubleType.h" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu new file mode 100644 index 0000000..2c671da --- /dev/null +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -0,0 +1,197 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +// 4d tensor B x D x H x W +// All kernels view batch dim B and feature dim D as collapsed. + +/* + * Description: + * this function adaptively average pools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output + */ + template +__global__ void adaptiveaveragepool(T *input, T *output, + int isizeH, int isizeW, + int osizeH, int osizeW, + int64_t istrideD, int64_t istrideH, int64_t istrideW) +{ + // iterators on output pixels + int oh, ow; + + // select input/output plane based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + output = output + o_plane*osizeH*osizeW; + input = input + i_plane*istrideD; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + const int ostepH = blockDim.y*gridDim.y; + + int ostartW = threadIdx.x; + int oendW = osizeW; + const int ostepW = blockDim.x; + + // For all output pixels... 
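+  // Illustrative note (editorial): START_IND/END_IND carve the input into adaptive bins.
+  // For example, with isizeH = 5 and osizeH = 3, output row 0 averages input rows [0, 2),
+  // row 1 averages rows [1, 4), and row 2 averages rows [3, 5); bins may overlap and
+  // their sizes kH, kW differ by at most one.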
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the average pooling over corresponding input pixels + T *ptr_input = input + istartH*istrideH + istartW*istrideW; + T *ptr_output = output + oh*osizeW + ow; + T sum = ScalarConvert::to(0); + int ih, iw; + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + T val = ptr_input[iw*istrideW]; + sum += val; + } + ptr_input += istrideH; // next input line + } + // Update output + *ptr_output = sum / kH / kW; + } + } +} + +/* + * Description: + * this function computes the gradInput from gradOutput + */ + template +__global__ void adaptiveaveragegradinput( + T *gradInput, T *gradOutput, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators on input pixels + int ih, iw; + + // select input/output plane based on thread/block ID + int i_plane = blockIdx.x; + int o_plane = i_plane; + + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + + int istartH = blockDim.y*blockIdx.y + threadIdx.y; + int iendH = isizeH; + int istepH = blockDim.y*gridDim.y; + + int istartW = threadIdx.x; + int iendW = isizeW; + int istepW = blockDim.x; + + // compute gradInput + for(ih = istartH; ih < iendH; ih += istepH) { + + int ostartH = START_IND(ih, isizeH, osizeH); + int oendH = END_IND(ih, isizeH, osizeH); + + for(iw = istartW; iw < iendW; iw += istepW) { + + int ostartW = START_IND(iw, isizeW, osizeW); + int oendW = END_IND(iw, isizeW, osizeW); + + // Compute the gradients over corresponding output pixels + T *ptr_gradInput = gradInput + ih*isizeW + iw; + + int oh, ow; + for(oh = ostartH; oh < oendH; ++oh) { + int kH = START_IND(oh, osizeH, isizeH) - END_IND(oh, osizeH, isizeH); + for(ow = ostartW; ow < oendW; ++ow) { + int kW = START_IND(ow, osizeW, isizeW) - END_IND(ow, osizeW, isizeW); + T grad_delta = gradOutput[ow + oh*osizeW] / kH / kW; + *ptr_gradInput += grad_delta; + } + } + } + } +} + +/* + * Description: + * this function computes the gradInput from gradOutput + * (uses atomic add) + */ + template +__global__ void atomicadaptiveaveragegradinput( + T *gradInput, T *gradOutput, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators on output indices + int oh, ow; + + // select input/output plane based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + gradOutput = gradOutput + o_plane*osizeW*osizeH; + gradInput = gradInput + i_plane*isizeW*isizeH; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // For all output pixels... 
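+  // Illustrative note (editorial): unlike adaptiveaveragegradinput above, which walks the
+  // *input* pixels so each gradInput element has a single writer, this kernel walks the
+  // *output* pixels. Because adjacent adaptive bins can overlap (e.g. bins [0, 2) and
+  // [1, 4) both touch input row 1), several threads may scatter into the same gradInput
+  // element, hence the atomicAdd further down.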
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients for over corresponding input pixels + T *ptr_gradInput = gradInput + istartH*isizeW + istartW; + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput / kW / kH; + + int ih, iw; + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + // atomic add since different threads could update same variable + atomicAdd(&(ptr_gradInput[iw]), grad_delta); + } + ptr_gradInput += isizeW; // next input line + } + } + } +} + +#include "generic/SpatialAdaptiveAveragePooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS +#undef START_IND +#undef END_IND diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu new file mode 100644 index 0000000..592e6fd --- /dev/null +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -0,0 +1,180 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +/* + * Description: + * this function adaptively maxpools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output, 4D argmax x and y + */ + template +__global__ void adaptivemaxpool(T *input, T *output, THCIndex_t *indices, + int isizeH, int isizeW, + int osizeH, int osizeW, + int64_t istrideD, int64_t istrideH, int64_t istrideW) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + int ostartW = threadIdx.x; + int oendW = osizeW; + const int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + const int ostepH = blockDim.y*gridDim.y; + // select input/output plane + output = output + o_plane*osizeH*osizeW; + input = input + i_plane*istrideD; + indices = indices + o_plane*osizeH*osizeW; + + // For all output pixels... + for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + + int kW = iendW - istartW; + + // Compute the mean of the input image... 
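+      // Editorial note: despite the comment above (apparently carried over from the
+      // average-pooling kernel), this block computes the max and its flattened argmax
+      // over the adaptive window; the `(val > max) || isnan(val)` test also lets NaNs
+      // propagate to the output instead of being silently skipped.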
+ T *ptr_input = input + istartH*istrideH + istartW*istrideW; + T *ptr_output = output + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + int argmax = -1; + T max = THCNumerics::min(); + int ih, iw; + for(ih = 0; ih < kH; ih++) { + for(iw = 0; iw < kW; iw++) { + T val = ptr_input[iw*istrideW]; + if ((val > max) || THCNumerics::isnan(val)) { + max = val; + argmax = (ih+istartH)*isizeW + iw+istartW; + } + } + ptr_input += istrideH; // next input line + } + // Update output and argmax + *ptr_output = max; + *ptr_ind = argmax + TH_INDEX_BASE; + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void adaptivemaxgradinput(T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeH, int isizeW, + int osizeH, int osizeW) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + //int k = blockIdx.x % sizeD; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + indices = indices + o_plane*osizeH*osizeW; + + // compute gradInput + for(oh = ostartH; oh < oendH; oh += ostepH) { + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + T z = *ptr_gradOutput; + + int argmax = (*ptr_ind) - TH_INDEX_BASE; + + gradInput[argmax] += z; + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + * when kH != dH or kW != dW (uses atomic add) + */ + template +__global__ void atomicadaptivemaxgradinput( + T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + indices = indices + o_plane*osizeH*osizeW; + + // compute gradInput + for(oh = ostartH; oh < oendH; oh += ostepH) { + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + T z = *ptr_gradOutput; + + int argmax = (*ptr_ind) - TH_INDEX_BASE; + + // atomic add since different threads could update same variable + atomicAdd(&(gradInput[argmax]), z); + } + } +} + +#include "generic/SpatialAdaptiveMaxPooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu new file mode 100644 index 0000000..ce9941a --- /dev/null +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -0,0 +1,86 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "common.h" + +template +__global__ void AvePoolForward(const int nthreads, + const Dtype* const bottom_data, const int num, const int channels, + const int height, const int width, const 
int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + Dtype* const top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + Acctype aveval = Acctype(0); + const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + if(COUNT_INCLUDE_PAD) + top_data[index] = ScalarConvert::to(aveval / pool_size); + else + top_data[index] = ScalarConvert::to(aveval / ((hend - hstart) * (wend - wstart))); + } +} + +template +__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + Dtype* const bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Acctype gradient = Acctype(0); + const Dtype* const top_diff_slice = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + if(COUNT_INCLUDE_PAD) + gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; + else + gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart)); + } + } + bottom_diff[index] = ScalarConvert::to(gradient); + } +} + +#include "generic/SpatialAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu new file mode 100644 index 0000000..83addd0 --- /dev/null +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -0,0 +1,161 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include + +#include + +template +__global__ void SpatialClassNLLCriterion_updateOutput_no_reduce_kernel( + int64_t nthreads, + THCDeviceTensor input, + THCDeviceTensor target, + THCDeviceTensor output, + Dtype *weights, + int64_t ignore_index) { + int64_t batch_size = input.getSize(0); + int64_t H = input.getSize(2); + int64_t W = input.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + const int64_t b = index % batch_size; + const int64_t h = (index / batch_size) % H; + const int64_t w = (index / (batch_size * H)) % W; + + int64_t cur_target = target[b][h][w] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + output[b][h][w] = ScalarConvert::to(0); + continue; + } + Dtype value = input[b][cur_target][h][w]; + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + output[b][h][w] = -value * weight; + } +} + +template +__global__ void SpatialClassNLLCriterion_updateGradInput_no_reduce_kernel( + int64_t nthreads, + THCDeviceTensor target, + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + Dtype *weights, + int64_t ignore_index) { + int64_t batch_size = target.getSize(0); + int64_t H = target.getSize(1); + int64_t W = target.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + const int64_t b = index % batch_size; + const int64_t h = (index / batch_size) % H; + const int64_t w = (index / (batch_size * H)) % W; + + int64_t cur_target = target[b][h][w] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + Dtype value = + -(weights ? 
weights[cur_target] : ScalarConvert::to(1)); + gradInput[b][cur_target][h][w] = value * gradOutput[b][h][w]; + } +} + +template +__global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel( + T *output, + T *total_weight, + T *input, + THCIndex_t *target, + T *weights, + int size_average, + int batch_size, + int n_classes, + int map_nelem, + int blocks_per_sample, + int64_t ignore_index) +{ + __shared__ AccumT partial_sums[CUDA_NUM_THREADS]; + + int i, t; + T cur_weight; + AccumT input_sum = 0; + AccumT acc_weight = 0; + + int sample = blockIdx.x / blocks_per_sample; + int toffset = sample * map_nelem; + int ioffset = sample * map_nelem * n_classes; + int step = blockDim.x * blocks_per_sample; + for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; + i < map_nelem; + i += step) { + t = target[toffset + i] - TH_INDEX_BASE; + if (t != ignore_index) { + assert(t >= 0 && t < n_classes); + cur_weight = weights ? weights[t] : ScalarConvert::to(1); + input_sum -= input[ioffset + i + map_nelem * t] * cur_weight; + acc_weight += cur_weight; + } + } + + input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus(), AccumT(0)); + __syncthreads(); + acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus(), AccumT(0)); + + if (threadIdx.x == 0) { + atomicAdd(total_weight, ScalarConvert::to(acc_weight)); + atomicAdd(output, ScalarConvert::to(input_sum)); + } +} + +template +__global__ void cunn_SpatialClassNLLCriterion_sizeAverage_kernel( + T *output, + T *total_weight) +{ + if (*total_weight > 0) + *output = THCNumerics::div(*output, *total_weight); +} + +template +__global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( + T *gradInput, + T *gradOutput, + THCIndex_t *target, + T *weights, + T *total_weight, + int size_average, + int batch_size, + int n_classes, + int map_nelem, + int blocks_per_sample, + int64_t ignore_index) +{ + if (*total_weight <= 0) + return; + + int i, t; + T norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + + int sample = blockIdx.x / blocks_per_sample; + int step = blockDim.x * blocks_per_sample; + int toffset = sample * map_nelem; + int ioffset = sample * map_nelem * n_classes; + for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; + i < map_nelem; + i += step) { + t = (int)target[toffset + i] - TH_INDEX_BASE; + if (t != ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[ioffset + i + map_nelem * t] = -(weights ? 
weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } + } +} + +#include "generic/SpatialClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu new file mode 100644 index 0000000..17801d5 --- /dev/null +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/SpatialConvolutionLocal.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu new file mode 100644 index 0000000..4a59acb --- /dev/null +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -0,0 +1,10 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialConvolutionMM.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu new file mode 100644 index 0000000..cd6f081 --- /dev/null +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -0,0 +1,126 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include "common.h" + +template +__global__ void +#if __CUDA_ARCH__ >= 320 +__launch_bounds__(CUDA_NUM_THREADS) +#endif +LRNFillScale(const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + const Dtype* const in_off = in + offset; + Dtype* const scale_off = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + Acctype accum_scale = Acctype(0); + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_off[(head - post_pad) * step] = ScalarConvert::to(k + accum_scale * alpha_over_size); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_off[(head - post_pad) * step] = ScalarConvert::to(k + accum_scale * alpha_over_size); + ++head; + } + } +} + +template +__global__ void LRNComputeOutput(const int nthreads, const Dtype* in, + const Dtype* scale, const Dtype negative_beta, Dtype* out) { + CUDA_KERNEL_LOOP(index, nthreads) { + out[index] = in[index] * pow(scale[index], negative_beta); + } +} + +template +__global__ void LRNComputeDiff(const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + 
const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + const Dtype* const bottom_off = bottom_data + offset; + const Dtype* const top_off = top_data + offset; + const Dtype* const scale_off = scale + offset; + const Dtype* const top_diff_off = top_diff + offset; + Dtype* const bottom_diff_off = bottom_diff + offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + Acctype accum_ratio = Acctype(0); + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff_off[head * step] * top_off[head * step] / + scale_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff_off[head * step] * top_off[head * step] / + scale_off[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff_off[(head - size) * step] * + top_off[(head - size) * step] / scale_off[(head - size) * step]; + } + bottom_diff_off[(head - post_pad) * step] = + ScalarConvert::to(top_diff_off[(head - post_pad) * step] + * pow(scale_off[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff_off[(head - size) * step] * + top_off[(head - size) * step] / scale_off[(head - size) * step]; + } + bottom_diff_off[(head - post_pad) * step] = + ScalarConvert::to(top_diff_off[(head - post_pad) * step] + * pow(scale_off[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); + ++head; + } + } +} + + +#include "generic/SpatialCrossMapLRN.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu new file mode 100644 index 0000000..a0231aa --- /dev/null +++ b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu @@ -0,0 +1,258 @@ +// updateOutput, updateGradInput Kernels ported from Sergey Zagoruyko's pyinn, which itself was a +// port from Caffe + +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCNumerics.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCSortUtils.cuh" +#include "THCTensorMathReduce.cuh" +#include "SharedMem.cuh" +#include "common.h" +#include + + +const int WARP_SIZE = 32; +// Crude benchmarks suggest 256 is better than 512 and 1024 +// TODO: Autotune/use better heuristics, improve speed more. 
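+// Editorial note: with WARP_SIZE = 32 above and the MAX_BLOCK_SIZE defined just below,
+// getGradParamsNumThreads() gives each batch item its own warp, capped at
+// MAX_BLOCK_SIZE / WARP_SIZE = 8 warps. For example, batchSize = 4 yields 128 threads
+// (4 warps), while batchSize = 32 yields 256 threads, so each of the 8 warps loops over
+// 4 batch items in the accGradParameters kernel further down.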
+const int MAX_BLOCK_SIZE = 256; + +static int getGradParamsNumThreads(int batchSize){ +//warp per item in a batch, up to a maximum + return std::min(batchSize * WARP_SIZE, MAX_BLOCK_SIZE); + +} + +template +__global__ void spatialDepthwiseConvolutionUpdateOutput( + const THCDeviceTensor input, + THCDeviceTensor output, + const THCDeviceTensor weight, + const THCDeviceTensor bias, + bool biasEnabled, + IndexType totalElements, + const int outputChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int KW_LIMIT = (kSize !=0) ? kSize : kernelWidth; + const int KH_LIMIT = (kSize !=0) ? kSize : kernelHeight; + + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + //calculate n,c,h,w indices, replacing modulos by divide and multiply add, + //result is same as would be in the code below + //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth + //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth + //const int h = (linearIndex / outputWidth) % outputHeight; + //const int w = linearIndex % outputWidth; + + int indtmp1 = linearIndex/outputWidth; + const int w = linearIndex - indtmp1 * outputWidth; + int indtmp2 = indtmp1/outputHeight; + const int h = indtmp1 - indtmp2 * outputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/outputChannels; + const int c = indtmp1 - indtmp2 * outputChannels; + const int n = indtmp2; + + int inputChannel = c; + int inputChannels = outputChannels; + if (depthwiseMultiplier !=1) { + inputChannel /= depthwiseMultiplier; + inputChannels /= depthwiseMultiplier; + } + + int weightOffset = c * kernelHeight * kernelWidth; + + AccT value = biasEnabled ? ScalarConvert::to(bias.data()[c]) : ScalarConvert::to(0); + const IndexType offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; +#pragma unroll + for (int kH = 0; kH < KH_LIMIT; ++kH) { +#pragma unroll + for (int kW = 0; kW < KW_LIMIT; ++kW) { + const int h_in = -padHeight + h * strideHeight + kH * dilationHeight; + const int w_in = -padWidth + w * strideWidth + kW * dilationWidth; + + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { + const IndexType offset = offset0 + h_in * inputWidth + w_in; + value = THCNumerics::add( + value, + THCNumerics::mul( + ScalarConvert::to(weight.data()[weightOffset]), + ScalarConvert::to(input.data()[offset]))); + } + ++weightOffset; + } + } + output.data()[linearIndex] = ScalarConvert::to(value); + } +} + +template +__global__ void spatialDepthwiseConvolutionUpdateGradInput( + const THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + const THCDeviceTensor weight, + IndexType totalElements, + const int inputChannels, + const int depthwiseMultiplier, + const int outputChannels, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int KW_LIMIT = (kSize !=0) ? 
kSize : kernelWidth; + const int KH_LIMIT = (kSize !=0) ? kSize : kernelHeight; + const int strideW = (stride !=0) ? stride : strideWidth; + const int strideH = (stride !=0) ? stride : strideHeight; + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + + int indtmp1 = linearIndex/inputWidth; + const int w = linearIndex - indtmp1 * inputWidth; + int indtmp2 = indtmp1/inputHeight; + const int h = indtmp1 - indtmp2 * inputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/inputChannels; + const int c = indtmp1 - indtmp2 * inputChannels; + const int n = indtmp2; + + AccT value = ScalarConvert::to(0); + +#pragma unroll + for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) { + int och = (c * depthwiseMultiplier) + multiplier; + int weightOffset = och * kernelHeight * kernelWidth; +#pragma unroll + for (int kh = 0; kh < KH_LIMIT; ++kh) { +#pragma unroll + for (int kw = 0; kw < KW_LIMIT; ++kw) { + int h_out = h + padHeight - kh * dilationHeight; + int w_out = w + padWidth - kw * dilationWidth; + if ((h_out % strideH == 0) && (w_out % strideW == 0)) { + h_out = h_out / strideH; + w_out = w_out / strideW; + + if ((h_out >= 0) && (h_out < outputHeight) + && (w_out >= 0) && (w_out < outputWidth)) { + + const int offset = ((n * outputChannels + och) * outputHeight + h_out) + * outputWidth + w_out; + value = THCNumerics::add( + value, + THCNumerics::mul( + ScalarConvert::to(weight.data()[weightOffset]), + ScalarConvert::to(gradOutput.data()[offset]))); + } + } + ++weightOffset; + } + } + } + gradInput.data()[linearIndex] = ScalarConvert::to(value); + } +} + + +template +__global__ void spatialDepthwiseConvolutionAccGradParameters( + const THCDeviceTensor gradOutput, + const THCDeviceTensor input, + THCDeviceTensor gradWeight, + const int batchSize, + const int inputChannels, + const int kernelChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int channelStride = kernelWidth * kernelHeight; + + // Have to use a statically typed Shared Memory pointer + SharedMem smem; + + // Each Block is responsible for accumulating over a permutation of + // (channels x kH x kW), use blockIdx to determine which one + int bidx = blockIdx.x; + int kW = bidx % kernelWidth; + int kH = (bidx / kernelWidth) % kernelHeight; + int ch = (bidx / channelStride); + + // Need to calculate which input channel is associated with this filter + // channel + int inputCh = ch / depthwiseMultiplier; + + AccT grad = ScalarConvert::to(0.0); + + const int laneId = threadIdx.x % WARP_SIZE; + const int batch = threadIdx.x / WARP_SIZE; + const int nwarps = blockDim.x / WARP_SIZE; + const int imageElements = outputWidth * outputHeight; + // Use warp per item. In the original kernel, a threadblock was used to sum over NHW. + // Here, we use a warp to sum values over HW dimension, and if batchSize is larger than the + // number of warps, a warp would loop over remaining batch items (e.g. if there are 8 warps, + // warp 0 would go over 0-8-16 etc image, warp 1 over 1-9-17 etc). Later in blockReduce, + // all the warps will be reduced anyway, thus the full reduction will be over NHW, like it + // should be. 
That allows to get rid of one modulo operation inside the loop (because n/batchIdx + // now does not have to be computed through modulo, you are just looping over it), and + // bring a nice speed-up. + for (int batchIdx = batch; batchIdx < batchSize; batchIdx += nwarps){ + // Warp-stride loop over elements in a batch item + for (IndexType idx = laneId; idx < imageElements; idx += WARP_SIZE) { + // Need to calculate the following: batch position, and offset into the gradOutput + // in height, and width. We can intuit the corresponding position in the input from + // the other parameters we have + int go_w_offset = idx % outputWidth; + int go_h_offset = (idx / outputWidth); + + int i_w_offset = (go_w_offset * strideWidth) + (kW * dilationWidth) - padWidth; + int i_h_offset = (go_h_offset * strideHeight) + (kH * dilationHeight) - padHeight; + + if (i_w_offset >= 0 && i_h_offset >= 0 && i_w_offset < inputWidth && i_h_offset < inputHeight) { + int inputOffset = ((batchIdx * inputChannels + inputCh) * inputHeight + i_h_offset) * inputWidth + i_w_offset; + int outputOffset = ((batchIdx * kernelChannels + ch) * outputHeight ) * outputWidth + idx; + grad = THCNumerics::add( + grad, + THCNumerics::mul( + ScalarConvert::to(input.data()[inputOffset]), + ScalarConvert::to(gradOutput.data()[outputOffset]))); + } + } + } + __syncthreads(); + + // At this point each thread in the block has a local gradient, which we need to + // accumulate prior to writing the global value + AccT *buf = smem.getPointer(); + AccT tval = reduceBlock>( + buf, blockDim.x, grad, ReduceAdd(), ScalarConvert::to(0)); + + // After reduction, first thread in the block has the gradient, so its responsible + // for writing it to gradWeight + if (threadIdx.x == 0) { + int weightOffset = kW + (kernelWidth * kH) + (kernelWidth * kernelHeight * ch); + gradWeight.data()[weightOffset] = ScalarConvert::to(tval); + } +} + +#include "generic/SpatialDepthwiseConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu new file mode 100644 index 0000000..b8e9602 --- /dev/null +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/SpatialDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu new file mode 100644 index 0000000..6732e4f --- /dev/null +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -0,0 +1,116 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCNumerics.cuh" +#include "common.h" + +// kernels borrowed from Caffe +template +__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, Dtype* top_data, + int64_t* top_mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / 
channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + while(hstart < 0) + hstart += dilation_h; + while(wstart < 0) + wstart += dilation_w; + AccType maxval = THCNumerics::min(); + int maxidx = -1; + bottom_data += (n * channels + c) * height * width; + for (int h = hstart; h < hend; h += dilation_h) { + for (int w = wstart; w < wend; w += dilation_w) { + Dtype val = bottom_data[h * width + w]; + if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { + maxidx = h * width + w; + maxval = ScalarConvert::to(val); + } + } + } + top_data[index] = ScalarConvert::to(maxval); + top_mask[index] = maxidx + TH_INDEX_BASE; + } +} + +const int BACKWARD_THREADS = 256; + +template +__launch_bounds__(BACKWARD_THREADS,2048/BACKWARD_THREADS) +__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, + const int64_t* top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, height*width) { + int h = index/width; + int w = index - h * width; +//get some templating performance benefits without actually templating + int phstart, phend, pwstart, pwend; + if (stride_h == 1) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; + phend = min((h + pad_h) + 1, pooled_height); + } else if (stride_h == 2) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; + phend = min((h + pad_h) / 2 + 1, pooled_height); + } else { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; + phend = min((h + pad_h) / stride_h + 1, pooled_height); + } + if (stride_w == 1) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; + pwend = min((w + pad_w) + 1, pooled_width); + } else if (stride_w == 2) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; + pwend = min((w + pad_w) / 2 + 1, pooled_width); + } else { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 
0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; + pwend = min((w + pad_w) / stride_w + 1, pooled_width); + } + for (int n = blockIdx.y; n < num; n += gridDim.y) + for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + + AccType gradient = AccType(0); + int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + top_mask += offset; +//get some templating performance benefits without actually templating + if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) { + gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); + } + } + } + } else { + if (top_mask[phstart * pooled_width + pwstart] - TH_INDEX_BASE == h * width + w) { + gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); + } + } + bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); + } + } +} + +#include "generic/SpatialDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu new file mode 100644 index 0000000..f3ca162 --- /dev/null +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -0,0 +1,113 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__device__ inline int getInterval(Acctype sample, + int index, + int inputSize, + int outputSize, + int poolSize) { + Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); + if (index == outputSize - 1) { + return inputSize - poolSize; + } else { + return (int) ((index + sample) * alpha) - (int) (sample * alpha); + } +} + +// We template on poolSizeW to allow the innermost loop to be unrolled +template +__global__ void SpatialFractionalMaxPooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + THCDeviceTensor indices, + THCDeviceTensor samples, + int poolSizeW, int poolSizeH) { + + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < output.getSize(2) * output.getSize(3)) { + int outputW = ourOutputPoint % output.getSize(3); + int outputH = ourOutputPoint / output.getSize(3); + + int poolW = getInterval(ScalarConvert::to(samples[batch][plane][0]), outputW, + input.getSize(3), output.getSize(3), poolSizeW); + int poolH = getInterval(ScalarConvert::to(samples[batch][plane][1]), outputH, + input.getSize(2), output.getSize(2), poolSizeH); + + Dtype maxVal = THCNumerics::min(); + int maxIndex = -1; + + for (int h = poolH; h < poolH + poolSizeH; ++h) { + if (PoolSizeWStatic == -1) { + for (int w = poolW; w < poolW + poolSizeW; ++w) { + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } + } + } else { +#pragma unroll + for (int i = 0; i < PoolSizeWStatic; ++i) { + int w = i + poolW; + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } + } + } + } + + 
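+    // getInterval always yields a pooling window that is non-empty and lies
+    // fully inside the input plane, so the scan above should have recorded a
+    // maximum; the asserts below only check that this invariant held.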
assert(THCNumerics::ne(maxVal, THCNumerics::min())); + assert(maxIndex != -1); + + // +1 for Lua index + indices[batch][plane][outputH][outputW] = maxIndex + TH_INDEX_BASE; + output[batch][plane][outputH][outputW] = maxVal; + } +} + +template +__global__ void SpatialFractionalMaxPooling_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + THCDeviceTensor indices) { + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3)) { + int outputW = ourOutputPoint % gradOutput.getSize(3); + int outputH = ourOutputPoint / gradOutput.getSize(3); + + int index = indices[batch][plane][outputH][outputW] - TH_INDEX_BASE; + assert(index >= 0); + int inputW = index % gradInput.getSize(3); + int inputH = index / gradInput.getSize(3); + assert(inputH < gradInput.getSize(2)); + + atomicAdd(gradInput[batch][plane][inputH][inputW].data(), + gradOutput[batch][plane][outputH][outputW]); + } +} + +#include "generic/SpatialFractionalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu new file mode 100644 index 0000000..4e37ecf --- /dev/null +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -0,0 +1,8 @@ +#include "THCUNN.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu new file mode 100644 index 0000000..61e1fe5 --- /dev/null +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -0,0 +1,9 @@ +#include "THCUNN.h" +#include "im2col.h" +#include "THCTensor.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialFullDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu new file mode 100644 index 0000000..30a1a5d --- /dev/null +++ b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu @@ -0,0 +1,243 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, H, W)) { \ + atomicAdd(&input[n][c][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + int c; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { + out_val += input[n][c][iy_nw][ix_nw] * nw; + } + if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { + out_val += input[n][c][iy_ne][ix_ne] * ne; + } + if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { + out_val += input[n][c][iy_sw][ix_sw] * sw; + } + if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { + out_val += input[n][c][iy_se][ix_se] * se; + } + output[n][c][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy));; + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; 
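+    // (ix_sw, iy_sw) and (ix_se, iy_se) below complete the 2x2 neighbourhood
+    // around the unnormalized sampling location (ix, iy); the four corners
+    // are weighted by the bilinear surface areas nw/ne/sw/se computed next.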
+ int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + Dtype gradout; + Dtype nw_val; + Dtype ne_val; + Dtype sw_val; + Dtype se_val; + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + nw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { + nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; + } + ne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { + ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; + } + sw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { + sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; + } + se_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { + se_val = input[n][c][iy_se_cl][ix_se_cl]; + } + + gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); + gix += ne_val * (iy_sw - iy) * gradout; + gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); + gix += se_val * (iy - iy_nw) * gradout; + + giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); + giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + Dtype gix_old = gradGrid[n][h][w][0]; + Dtype giy_old = gradGrid[n][h][w][1]; + + gradGrid[n][h][w][0] = gix_old + gix; + gradGrid[n][h][w][1] = giy_old + giy; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/SpatialGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialMaxPooling.cu b/aten/src/THCUNN/SpatialMaxPooling.cu new file mode 100644 index 0000000..90e6fe4 --- /dev/null +++ b/aten/src/THCUNN/SpatialMaxPooling.cu @@ -0,0 +1,4 @@ +#include "THCUNN.h" + +#include "generic/SpatialMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialMaxUnpooling.cu b/aten/src/THCUNN/SpatialMaxUnpooling.cu new file mode 100644 index 0000000..56488fd --- /dev/null +++ b/aten/src/THCUNN/SpatialMaxUnpooling.cu @@ -0,0 +1,32 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" + +template +__global__ void 
MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const int64_t* bottom_mask, + const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels + int c = (index / iwidth / iheight) % channels; + int n = index / iwidth / iheight / channels; + top_data += (n*channels + c)*oheight*owidth; + int maxind = bottom_mask[index] - TH_INDEX_BASE; + + top_data[maxind] = bottom_data[index]; + } +} + +template +__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const int64_t* bottom_mask, + const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + int c = (index / iwidth / iheight) % channels; + int n = index / iwidth / iheight / channels; + top_diff += (n*channels + c)*oheight*owidth; + int maxind = bottom_mask[index] - TH_INDEX_BASE; + + bottom_diff[index] = top_diff[maxind]; + } +} + +#include "generic/SpatialMaxUnpooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu new file mode 100644 index 0000000..96472ee --- /dev/null +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -0,0 +1,87 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void SpatialReflectionPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2) * output.getSize(3)) { + return; + } + int outputPointX = outputPointId % output.getSize(3); + int outputPointY = outputPointId / output.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (input.getSize(3) + padL - 1)) + - outputPointX + + 2 * padL + input.getSize(3) - 1 + - oStartX + iStartX; + + int inputPointY = abs(outputPointY - padT) + - abs(outputPointY - (input.getSize(2) + padT - 1)) + - outputPointY + + 2 * padT + input.getSize(2) - 1 + - oStartY + iStartY; + + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; + output[batch][plane][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void SpatialReflectionPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(3); + int outputPointY = outputPointId / gradOutput.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (gradInput.getSize(3) + padL - 1)) + - outputPointX + + 2 * padL + gradInput.getSize(3) - 1 + - 
oStartX + iStartX; + + int inputPointY = abs(outputPointY - padT) + - abs(outputPointY - (gradInput.getSize(2) + padT - 1)) + - outputPointY + + 2 * padT + gradInput.getSize(2) - 1 + - oStartY + iStartY; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); +} + +#include "generic/SpatialReflectionPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu new file mode 100644 index 0000000..f63c209 --- /dev/null +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -0,0 +1,70 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void SpatialReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2) * output.getSize(3)) { + return; + } + int outputPointX = outputPointId % output.getSize(3); + int outputPointY = outputPointId / output.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX; + int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY; + + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; + output[batch][plane][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void SpatialReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(3); + int outputPointY = outputPointId / gradOutput.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX; + int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); +} + + +#include "generic/SpatialReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu new file mode 100644 index 0000000..bb04846 --- /dev/null +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -0,0 +1,265 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +/* + * Description: + * this function subsamples an input 3D tensor along dimensions 1 and 2 + * 3D input, 3D output, 1D weight, 1D bias + 
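+ *   each output element is the sum over a kH x kW input window, scaled by
+ *   the per-plane weight[k] and offset by the per-plane bias[k]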
*/ + template +__global__ void subsample(Dtype *input, Dtype *output, Dtype *weight, Dtype *bias, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + output = output + o*output_w*output_h; + input = input + i*input_w*input_h; + + // Get the good mask for (k,i) (k out, i in) + Dtype the_weight = weight[k]; + + // Initialize to the bias + Dtype the_bias = bias[k]; + + // For all output pixels... + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + // Compute the mean of the input image... + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_output = output + yy*output_w + xx; + Acctype sum = 0; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += input_w; // next input line + } + // Update output + *ptr_output = ScalarConvert::to(the_weight*sum + the_bias); + } + } +} + +/* + * Description: + * this function computes the gradWeight from input and gradOutput + */ + template +__global__ void subgradweight(Dtype *input, Dtype *gradOutput, Dtype *gradWeight, Dtype *gradBias, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW, + float scale) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + input = input + i*input_w*input_h; + + // thread ID + int tid = blockDim.x*threadIdx.y + threadIdx.x; + + // create array to hold partial sums + __shared__ Acctype sums[CUDA_MAX_THREADS]; + sums[tid] = 0; + + // compute partial sums + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput; + int64_t kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + sums[tid] += z * ptr_input[kx]; + } + ptr_input += input_w; + } + } + } + __syncthreads(); + + // reduce: accumulate all partial sums to produce final gradWeight + if ((threadIdx.x == 0) && (threadIdx.y == 0)) { + Acctype scaledSums = Acctype(0); + for(int i = 0; i < blockDim.x*blockDim.y; i++) { + scaledSums += scale*sums[i]; + } + gradWeight[k] += ScalarConvert::to(scaledSums); + } + __syncthreads(); + + // compute gradBias + sums[tid] = 0; + for (int i=tid; i::to(scaledSums); + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void subgradinput(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // 
output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + gradInput = gradInput + i*input_w*input_h; + + // get weight + Dtype the_weight = weight[k]; + + // compute gradInput + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput * the_weight; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? + ptr_gradInput[kx] += z; + } + ptr_gradInput += input_w; + } + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void subgradinputAtomic(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + gradInput = gradInput + i*input_w*input_h; + + // get weight + Dtype the_weight = weight[k]; + + // compute gradInput + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput * the_weight; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? 
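+          // atomicAdd is needed in this variant: with overlapping windows
+          // (e.g. dW < kW or dH < kH) several output positions map to the
+          // same gradInput element, and those updates can come from
+          // different threads/blocks, so a plain += would race.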
+ atomicAdd(&(ptr_gradInput[kx]), z); + } + ptr_gradInput += input_w; + } + } + } +} + + +#include "generic/SpatialSubSampling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu new file mode 100644 index 0000000..07daa0e --- /dev/null +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -0,0 +1,124 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rheight, const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + return; + } + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] + + w1lambda * data1[n][c][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] + + w1lambda * data1[n][c][h1+h1p][w1+w1p]); + data2[n][c][h2][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rheight, const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][h1][w1]; + data1[n][c][h2][w2] += val; + } + } + return; + } + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][h2][w2]; + atomicAdd(data1[n][c][h1][w1].data(), + ScalarConvert::to(h0lambda * w0lambda * d2val)); + atomicAdd(data1[n][c][h1][w1+w1p].data(), + ScalarConvert::to(h0lambda * w1lambda * d2val)); + atomicAdd(data1[n][c][h1+h1p][w1].data(), + ScalarConvert::to(h1lambda * w0lambda * d2val)); + atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(), + ScalarConvert::to(h1lambda * w1lambda * d2val)); + } + } + } +} + + +#include "generic/SpatialUpSamplingBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu new file mode 100644 index 0000000..889d64e --- /dev/null +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -0,0 +1,102 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_4d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_4d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][h2][w2]; + data1[n][c][h1][w1] = val; + } + } + return; + } + // + const int h1 = 
nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][h2][w2]; + atomicAdd(data1[n][c][h1][w1].data(), d2val); + } + } + } +} + + +#include "generic/SpatialUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu new file mode 100644 index 0000000..a52ce34 --- /dev/null +++ b/aten/src/THCUNN/Sqrt.cu @@ -0,0 +1,33 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct sqrtupdateOutput_functor +{ + const T bias; + + sqrtupdateOutput_functor(T bias_) + : bias(bias_) + {} + + __device__ void operator()(T *output, const T *input) const + { + *output = sqrt(*input + bias); + } +}; + +template +struct sqrtupdateGradInput_functor +{ + sqrtupdateGradInput_functor() {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + *gradInput = (THCNumerics::eq(*output,ScalarConvert::to(0.0f))) ? ScalarConvert::to(0.0f) : ((ScalarConvert::to(0.5f) * *gradOutput) / *output); + } +}; + +#include "generic/Sqrt.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu new file mode 100644 index 0000000..66bbec4 --- /dev/null +++ b/aten/src/THCUNN/Square.cu @@ -0,0 +1,25 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct squareupdateOutput_functor +{ + __device__ void operator()(T* output, const T* input) const + { + *output = (*input) * (*input); + } +}; + +template +struct squareupdateGradInput_functor +{ + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const + { + *gradInput = ScalarConvert::to(2.0) * (*gradOutput) * (*input); + } +}; + +#include "generic/Square.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh new file mode 100644 index 0000000..c17f09d --- /dev/null +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -0,0 +1,248 @@ +#ifndef THC_HALF_AUTO_NUMERICS_INC +#define THC_HALF_AUTO_NUMERICS_INC + +#include "THCHalf.h" +#include "THCNumerics.cuh" + +// Half numerics functions defined as free functions, so cunn code can be +//written generically, i.e. without excessive calling of THCNumerics functions. + +// these functions should move to THCNumerics + +#ifdef CUDA_HALF_TENSOR +inline __host__ __device__ half fmaxType(half x, half y) { + return THCNumerics::ge(x, y) ? 
x : y; +} + +inline __host__ __device__ float fmaxType(float x, half y) { + return fmaxf(x, ScalarConvert::to(y)); +} +#endif + +inline __host__ __device__ float fmaxType(float x, float y) { + return fmaxf(x, y); +} + +inline __host__ __device__ double fmaxType(double x, double y) { + return fmax(x, y); +} + +#ifdef CUDA_HALF_TENSOR + +// arithmetic functions + +inline __host__ __device__ half operator+(half a, half b) { + return THCNumerics::add(a, b); +} + +inline __host__ __device__ float operator+(half a, float b) { + return ScalarConvert::to(a) + b; +} + +inline __host__ __device__ float operator+(float a, half b) { + return a + ScalarConvert::to(b); +} + +inline __host__ __device__ double operator+(double a, half b) { + return a + ScalarConvert::to(b); +} + +inline __host__ __device__ half operator-(half a) { + return THCNumerics::neg(a); +} + +inline __host__ __device__ half operator-(half a, half b) { + return THCNumerics::add(a, THCNumerics::neg(b)); +} + +inline __host__ __device__ half operator-(half a, int b) { + return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +} + +inline __host__ __device__ float operator-(half a, float b) { + return ScalarConvert::to(a) - b; +} + +inline __host__ __device__ double operator-(half a, double b) { + return ScalarConvert::to(a) - b; +} + +inline __host__ __device__ half operator-(int a, half b) { + return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); +} + +inline __host__ __device__ float operator-(float a, half b) { + return a - ScalarConvert::to(b); +} + +inline __host__ __device__ double operator-(double a, half b) { + return a - ScalarConvert::to(b); +} + +inline __host__ __device__ half operator*(half a, half b) { + return THCNumerics::mul(a, b); +} + +inline __host__ __device__ float operator*(half a, float b) { + return ScalarConvert::to(a) * b; +} + +inline __host__ __device__ double operator*(half a, double b) { + return ScalarConvert::to(a) * b; +} + +inline __host__ __device__ half operator*(half a, int b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ float operator*(float a, half b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ double operator*(double a, half b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ half operator/(half a, half b) { + return THCNumerics::div(a, b); +} + +inline __host__ __device__ float operator/(float a, half b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ double operator/(double a, half b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ half operator/(int a, half b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ float operator/(half a, float b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ double operator/(half a, double b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ half operator/(half a, int b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} +inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} + +inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { + lhs = lhs - rhs; + return lhs; +} + +inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { + lhs = lhs * rhs; + return lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { + lhs = lhs / rhs; + return 
lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { + lhs = lhs / rhs; + return lhs; +} + +inline __host__ __device__ half abs(half a) { + return THCNumerics::abs(a); +} + +inline __host__ __device__ half exp(half a) { + return THCNumerics::exp(a); +} + +inline __host__ __device__ half log10(half a) { + return THCNumerics::log10(a); +} + +inline __host__ __device__ half log1p(half a) { + return THCNumerics::log1p(a); +} + +inline __host__ __device__ half log2(half a) { + return THCNumerics::log2(a); +} + +inline __host__ __device__ half expm1(half a) { + return THCNumerics::expm1(a); +} + +inline __host__ __device__ half pow(half a, half b) { + return THCNumerics::pow(a, b); +} + +inline __host__ __device__ half sqrt(half a) { + return THCNumerics::sqrt(a); +} + +inline __host__ __device__ half tanh(half a) { + return THCNumerics::tanh(a); +} + +#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) +inline __host__ __device__ half operator+(half a, int b) { + return THCNumerics::add(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ double operator+(half a, double b) { + return ScalarConvert::to(a) + b; +} + +inline __host__ __device__ half operator*(half a, bool b) { + return THCNumerics::mul(a, ScalarConvert::to(b)); +} +#endif + +// comparison functions + +inline __host__ __device__ bool operator<(half a, half b) { + return THCNumerics::lt(a, b); +} + +inline __host__ __device__ bool operator<=(half a, half b) { + return THCNumerics::le(a, b); +} + +inline __host__ __device__ bool operator<=(half a, int b) { + return THCNumerics::le(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator<(half a, int b) { + return THCNumerics::lt(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator>(half a, half b) { + return THCNumerics::gt(a, b); +} + +inline __host__ __device__ bool operator>(half a, int b) { + return THCNumerics::gt(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator>=(half a, half b) { + return THCNumerics::ge(a, b); +} + +inline __host__ __device__ bool operator>=(half a, int b) { + return THCNumerics::ge(a, ScalarConvert::to(b)); +} + +#endif +#endif diff --git a/aten/src/THCUNN/THCUNN.h b/aten/src/THCUNN/THCUNN.h new file mode 100644 index 0000000..09070b1 --- /dev/null +++ b/aten/src/THCUNN/THCUNN.h @@ -0,0 +1,10 @@ +#include + +#define THCIndexTensor THCudaLongTensor +#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME +typedef int64_t THCIndex_t; + +#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME) + +#include "generic/THCUNN.h" +#include diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu new file mode 100644 index 0000000..6781f33 --- /dev/null +++ b/aten/src/THCUNN/Tanh.cu @@ -0,0 +1,35 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct tanh_updateGradInput_functor +{ + __device__ __forceinline__ void operator()(T *gradInput, + const T *output, const T *gradOutput) const { + *gradInput = *gradOutput * (1.f - *output * *output); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct tanh_updateGradInput_functor +{ + __device__ __forceinline__ void operator()(half *gradInput, + const half *output, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + const half out_square = __hmul(*output, *output); + *gradInput = __hmul(*gradOutput, __hadd(one, __hneg(out_square))); +#else + const float out = __half2float(*output); + const 
float go = __half2float(*gradOutput); + *gradInput = __float2half(go * (1.f - out * out)); +#endif + } +}; +#endif + +#include "generic/Tanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu new file mode 100644 index 0000000..af12169 --- /dev/null +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -0,0 +1,8 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include "generic/TemporalConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu new file mode 100644 index 0000000..2508f83 --- /dev/null +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -0,0 +1,86 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define TEMPORAL_MAX_POOLING_THREADS 1024 + +template +__global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index + Dtype *input_data = input + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *output_data = output + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + int time = 0; + int max_time = input_n * kW; + + Dtype max_value; + THCIndex_t max_index = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + max_value = THCNumerics::min(); + // For all values in the kernel space + for (time = 0; time < max_time; time += input_n) { + if (max_value < input_data[time + feat]) { + max_value = input_data[time + feat]; + max_index = time / input_n; + } + } + output_data[feat] = max_value; + indices_data[feat] = max_index; + } + } +} + +template +__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index + Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat]; + } + } +} + +template +__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * 
MAX_THREADS is the time index + Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]); + } + } +} + +#include "generic/TemporalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu new file mode 100644 index 0000000..4dd4da8 --- /dev/null +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -0,0 +1,70 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void TemporalReflectionPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2)) { + return; + } + int outputPointX = outputPointId % output.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (input.getSize(2) + padL - 1)) + - outputPointX + + 2 * padL + input.getSize(2) - 1 + - oStartX + iStartX; + + Dtype valueToCopy = input[batch][plane][inputPointX]; + output[batch][plane][outputPointX] = valueToCopy; +} + +template +__global__ void TemporalReflectionPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (gradInput.getSize(2) + padL - 1)) + - outputPointX + + 2 * padL + gradInput.getSize(2) - 1 + - oStartX + iStartX; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointX], valueToCopy); +} + +#include "generic/TemporalReflectionPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu new file mode 100644 index 0000000..2c812bd --- /dev/null +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void 
TemporalReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2)) { + return; + } + int outputPointX = outputPointId % output.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = min(max(padL, outputPointX), input.getSize(2) + padL - 1) - oStartX + iStartX; + + Dtype valueToCopy = input[batch][plane][inputPointX]; + output[batch][plane][outputPointX] = valueToCopy; +} + +template +__global__ void TemporalReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = min(max(padL, outputPointX), gradInput.getSize(2) + padL - 1) - oStartX + iStartX; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointX], valueToCopy); +} + + +#include "generic/TemporalReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu new file mode 100644 index 0000000..745fef8 --- /dev/null +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -0,0 +1,12 @@ +#include "THCUNN.h" +#include "common.h" +#include "row2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/TemporalRowConvolution.cu" + +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu new file mode 100644 index 0000000..89b0c37 --- /dev/null +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -0,0 +1,98 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + return; + } + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = w0lambda * data1[n][c][w1] + + w1lambda * data1[n][c][w1+w1p]; + data2[n][c][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][w1]; + data1[n][c][w2] += val; + } + } + return; + } + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][w2]; + atomicAdd(data1[n][c][w1].data(), + ScalarConvert::to(w0lambda * d2val)); + atomicAdd(data1[n][c][w1+w1p].data(), + ScalarConvert::to(w1lambda * d2val)); + } + } + } +} + + +#include "generic/TemporalUpSamplingLinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu new file mode 100644 index 0000000..c87129d --- /dev/null +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -0,0 +1,89 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_3d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + const float scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_3d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + const float scale = (float) width1 
/ (float) width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][w1]; + data1[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][w2]; + atomicAdd(data1[n][c][w1].data(), d2val); + } + } + } +} + + +#include "generic/TemporalUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu new file mode 100644 index 0000000..e7757eb --- /dev/null +++ b/aten/src/THCUNN/Threshold.cu @@ -0,0 +1,75 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct ThresholdUpdateOutput +{ + const T threshold_; + const T val_; + + ThresholdUpdateOutput(T threshold, T val) + : threshold_(threshold) + , val_(val) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > threshold_) ? x : val_; + } +}; + +// in-place variant +template +struct ThresholdUpdateOutputIP +{ + const T threshold_; + const T val_; + + ThresholdUpdateOutputIP(T threshold, T val) + : threshold_(threshold) + , val_(val) + {} + + __device__ __forceinline__ void operator()(T *x) + { + *x = (*x > threshold_) ? *x : val_; + } +}; + +template +struct ThresholdUpdateGradInput +{ + const T threshold_; + + ThresholdUpdateGradInput(T threshold) + : threshold_(threshold) + {} + + __device__ __forceinline__ void operator()( + T *gradInput, T *input, T *gradOutput) const + { + *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert::to(0); + } +}; + +template +struct ThresholdUpdateGradInputIP +{ + const T threshold_; + + ThresholdUpdateGradInputIP(T threshold) + : threshold_(threshold) + {} + + __device__ __forceinline__ void operator()( + T *gradOutput, T *input) const + { + *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert::to(0); + } +}; + +#include "generic/Threshold.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu new file mode 100644 index 0000000..84e2c7f --- /dev/null +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -0,0 +1,248 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +// 5d tensor B x D x T x H x W +// All kernels view batch dim B and feature dim D as collapsed. + +/* + * Description: + * This function adaptively average pools an input 5D tensor along dimensions + * 2, 3 and 4. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). 
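+ *
+ * Pooling regions come from the START_IND/END_IND macros above: along an axis
+ * with input size c and output size b, output index a covers the input range
+ * [floor(a*c/b), ceil((a+1)*c/b)). As an illustration only, isizeH = 10 and
+ * osizeH = 4 give H bins [0,3), [2,5), [5,8), [7,10), so neighbouring bins may
+ * share an input element; the two updateGradInput kernels below differ in how
+ * they handle that overlap.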
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel(
+  T *input, T *output,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t istrideD,
+  int64_t istrideT, int64_t istrideH, int64_t istrideW,
+  int64_t offsetZ)
+{
+  // iterators on output pixels
+  int ot, oh, ow;
+
+  // compute offsets based on thread/block ID
+  int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int oendH = osizeH;
+  int ostepH = gridDim.y * blockDim.y;
+  int ostartW = threadIdx.x;
+  int oendW = osizeW;
+  int ostepW = blockDim.x;
+
+  // select output plane
+  int64_t o_plane = blockIdx.x + offsetZ;
+  ot = o_plane % osizeT;     // output frame/time
+  int d = o_plane / osizeT;  // slice/feature
+
+  // input frame/time range is fixed.
+  int istartT = START_IND(ot, osizeT, isizeT);
+  int iendT = END_IND(ot, osizeT, isizeT);
+  int kT = iendT - istartT;
+
+  // input offset by slice/feature and earliest relevant frame/time
+  T *input_dt = input + d*istrideD + istartT*istrideT;
+  // output offset by slice/feature and frame/time
+  T *output_dt = output + o_plane*osizeH*osizeW;
+
+  // For all output pixels...
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+
+    int istartH = START_IND(oh, osizeH, isizeH);
+    int iendH = END_IND(oh, osizeH, isizeH);
+    int kH = iendH - istartH;
+
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+
+      int istartW = START_IND(ow, osizeW, isizeW);
+      int iendW = END_IND(ow, osizeW, isizeW);
+      int kW = iendW - istartW;
+
+      // Compute the average pooling from corresponding input pixels
+      T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW;
+      T *ptr_output = output_dt + oh*osizeW + ow;
+      T sum = ScalarConvert<int, T>::to(0);
+
+      int it, ih, iw;
+      for(it = 0; it < kT; ++it) {
+        for(ih = 0; ih < kH; ++ih) {
+          for(iw = 0; iw < kW; ++iw) {
+            T val = ptr_input[ih*istrideH + iw*istrideW];
+            sum += val;
+          }
+        }
+        ptr_input += istrideT;   // next input frame
+      }
+      // Update output
+      *ptr_output = sum / kT / kH / kW;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D input plane specified by
+ *    (blockIdx.x + offsetZ).
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel(
+  T *gradInput, T *gradOutput,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t offsetZ
+)
+{
+  // iterators on input pixels
+  int it, ih, iw;
+
+  // compute offsets based on thread/block ID
+  int istartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int iendH = isizeH;
+  int istepH = gridDim.y * blockDim.y;
+  int istartW = threadIdx.x;
+  int iendW = isizeW;
+  int istepW = blockDim.x;
+
+  // select input plane
+  int64_t i_plane = blockIdx.x + offsetZ;
+  it = i_plane % isizeT;     // input frame/time
+  int d = i_plane / isizeT;  // slice/feature
+
+  // output frame/time range is fixed.
+  int ostartT = START_IND(it, isizeT, osizeT);
+  int oendT = END_IND(it, isizeT, osizeT);
+
+  // gradInput offset by slice/feature and frame/time
+  T *gradInput_dt = gradInput + i_plane*isizeH*isizeW;
+  // gradOutput offset by slice/feature and earliest relevant frame/time
+  T *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW;
+
+  // For all input pixels...
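+  // Gather formulation: each thread owns one input pixel (ih, iw) of plane
+  // i_plane and sums, over every output bin that covers it, the corresponding
+  // gradOutput value divided by that bin's volume kT*kH*kW. Every gradInput
+  // element is therefore written by exactly one thread, so no atomics are
+  // needed (compare with the atomic variant further below).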
+ for(ih = istartH; ih < iendH; ih += istepH) { + + int ostartH = START_IND(ih, isizeH, osizeH); + int oendH = END_IND(ih, isizeH, osizeH); + + for(iw = istartW; iw < iendW; iw += istepW) { + + int ostartW = START_IND(iw, isizeW, osizeW); + int oendW = END_IND(iw, isizeW, osizeW); + + // Compute the gradients from corresponding output pixels + T *ptr_gradInput = gradInput_dt + ih*isizeW + iw; + T *ptr_gradOutput = gradOutput_dt; + + // for all relevant output pixels + int ot, oh, ow; + for(ot = ostartT; ot < oendT; ++ot) { + int kT = END_IND(ot, osizeT, isizeT) - START_IND(ot, osizeT, isizeT); + for(oh = ostartH; oh < oendH; ++oh) { + int kH = END_IND(oh, osizeH, isizeH) - START_IND(oh, osizeH, isizeH); + for(ow = ostartW; ow < oendW; ++ow) { + int kW = END_IND(ow, osizeW, isizeW) - START_IND(ow, osizeW, isizeW); + T grad_delta = ptr_gradOutput[oh*osizeW + ow] / kW / kH / kT; + *ptr_gradInput += grad_delta; + } + } + ptr_gradOutput += osizeH*osizeW; // next output frame + } + } + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput without assuming + * dependencies between input pixels and output pixels. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + * + * (uses atomic add) + */ + template +__global__ void cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( + T *gradInput, T *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ +) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // output slice/feature + + // input frame/time ramge is fixed. + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // gradInput offset by slice/feature and earliest relevant frame/time + T *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/time + T *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; + + // For all output pixels... 
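+  // Scatter formulation: each thread takes one output bin and adds
+  // gradOutput / (kT*kH*kW) into every input pixel of that bin. Because
+  // adjacent bins can overlap when the input size is not a multiple of the
+  // output size, two threads may hit the same gradInput element, hence the
+  // atomicAdd below.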
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients from corresponding input pixels + T *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; + T *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput / kT / kH / kW; + + int it, ih, iw; + for(it = 0; it < kT; ++it) { + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); + } + } + ptr_gradInput += isizeH*isizeW; // next input frame + } + } + } +} + +#include "generic/VolumetricAdaptiveAveragePooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS +#undef START_IND +#undef END_IND diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu new file mode 100644 index 0000000..6d542ba --- /dev/null +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -0,0 +1,207 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +/* + * Description: + * this function adaptively maxpools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output, 4D argmax x and y + */ + template +__global__ void cunn_VolumetricAdaptiveMaxPooling_updateOutput_kernel( + T *input, T *output, THCIndex_t *indices, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, + int64_t istrideT, int64_t istrideH, int64_t istrideW, + int64_t offsetZ) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // slice/feature + + // input frame/time ramge is fixed. + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // input offset by slice/feature and earliest relevant frame/time + T *input_dt = input + d*istrideD + istartT*istrideT; + // output offset by slice/feature and frame/time + T *output_dt = output + o_plane*osizeH*osizeW; + // indices offset by slice/feature and frame/time + THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW; + + // For all output pixels... 
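+  // The stored index is a linear offset into the d-th input plane,
+  // (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + (iw+istartW), shifted by
+  // TH_INDEX_BASE (the legacy Lua indexing base); the backward kernels subtract
+  // the base and use it to address gradInput directly. The "|| isnan(val)"
+  // test lets NaN inputs win the comparison, so NaNs propagate to the output
+  // instead of being silently skipped.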
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+
+    int istartH = START_IND(oh, osizeH, isizeH);
+    int iendH = END_IND(oh, osizeH, isizeH);
+    int kH = iendH - istartH;
+
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+
+      int istartW = START_IND(ow, osizeW, isizeW);
+      int iendW = END_IND(ow, osizeW, isizeW);
+      int kW = iendW - istartW;
+
+      // Compute the max pooling from corresponding input pixels
+      T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW;
+      T *ptr_output = output_dt + oh*osizeW + ow;
+      THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow;
+      int64_t argmax = -1;
+      T max = THCNumerics<T>::min();
+
+      int it, ih, iw;
+      for(it = 0; it < kT; ++it) {
+        for(ih = 0; ih < kH; ++ih) {
+          for(iw = 0; iw < kW; ++iw) {
+            T val = ptr_input[ih*istrideH + iw*istrideW];
+            if ((val > max) || THCNumerics<T>::isnan(val)) {
+              max = val;
+              argmax = (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + iw+istartW;
+            }
+          }
+        }
+        ptr_input += istrideT;   // next input frame
+      }
+      // Update output and argmax
+      *ptr_output = max;
+      *ptr_ind = argmax + TH_INDEX_BASE;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D output plane specified by
+ *    (blockIdx.x + offsetZ).
+ *
+ *    Assumes that input size can be perfectly divided by output size, i.e.
+ *    each input pixel can only be argmax of one output pixel.
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveMaxPooling_updateGradInput_kernel(
+  T *gradInput, T *gradOutput, THCIndex_t *indices,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t offsetZ
+)
+{
+  // iterators on output pixels
+  int oh, ow;
+
+  // compute offsets based on thread/block ID
+  int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int oendH = osizeH;
+  int ostepH = gridDim.y * blockDim.y;
+  int ostartW = threadIdx.x;
+  int oendW = osizeW;
+  int ostepW = blockDim.x;
+
+  // select output plane
+  int64_t o_plane = blockIdx.x + offsetZ;
+  int d = o_plane / osizeT;  // output slice/feature
+
+  // gradInput offset by slice/feature
+  T *gradInput_d = gradInput + d*isizeT*isizeH*isizeW;
+  // gradOutput offset by slice/feature and frame/time
+  T *gradOutput_dt = gradOutput + o_plane*osizeH*osizeW;
+  // indices offset by slice/feature and frame/time
+  THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW;
+
+  // For all output pixels...
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+      // Compute the gradients for the argmax input pixel
+      T *ptr_gradOutput = gradOutput_dt + oh*osizeW + ow;
+      THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow;
+      T grad_delta = *ptr_gradOutput;
+      int argmax = (*ptr_ind) - TH_INDEX_BASE;
+      gradInput_d[argmax] += grad_delta;
+    }
+  }
+}
+
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D output plane specified by
+ *    (blockIdx.x + offsetZ).
+ *
+ *    Uses atomic add.
+ */ + template +__global__ void cunn_atomic_VolumetricAdaptiveMaxPooling_updateGradInput_kernel( + T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ +) +{ + // iterators on output pixels + int oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + int d = o_plane / osizeT; // output slice/feature + + // gradInput offset by slice/feature + T *gradInput_d = gradInput + d*isizeT*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/otme + T *gradOutput_dt = gradOutput + o_plane*osizeH*osizeW; + // indices offset by slice/feature and frame/otme + THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW; + + // For all output pixels... + for(oh = ostartH; oh < oendH; oh += ostepH) { + for(ow = ostartW; ow < oendW; ow += ostepW) { + // Compute the gradients for the argmax input pixel + T *ptr_gradOutput = gradOutput_dt + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput; + int64_t argmax = (*ptr_ind) - TH_INDEX_BASE; + atomicAdd(&(gradInput_d[argmax]), grad_delta); + } + } +} + +#include "generic/VolumetricAdaptiveMaxPooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu new file mode 100644 index 0000000..610127c --- /dev/null +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -0,0 +1,279 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void cuda_VolumetricAveragePooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oCol < output.getSize(3)) + { + Acctype sum = 0.0; + + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, input.getSize(1) + padT); + int hend = min(hstart + kH, input.getSize(2) + padH); + int wend = min(wstart + kW, input.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, input.getSize(1)); + hend = min(hend, input.getSize(2)); + wend = min(wend, input.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + int ti, hi, wi; + for (ti = tstart; ti < tend; ++ti) + { + for (hi = hstart; hi < hend; ++hi) + { + for (wi = wstart; wi < wend; ++wi) + { + Dtype val = input[slice][ti][hi][wi]; + sum += val; + } + } + } + + 
output[slice][oFrame][oRow][oCol] = ScalarConvert::to(sum / divide_factor); + } +} + +// Inner-most loop size (kW) passed as template parameter for +// performance reasons. +// +template +__global__ void cuda_VolumetricAveragePooling_updateOutput_fixedKW( + THCDeviceTensor input, + THCDeviceTensor output, + int kT, int kH, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oCol < output.getSize(3)) + { + Acctype sum = 0.0; + + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, input.getSize(1) + padT); + int hend = min(hstart + kH, input.getSize(2) + padH); + int wend = min(wstart + KERNEL_WIDTH, input.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, input.getSize(1)); + hend = min(hend, input.getSize(2)); + wend = min(wend, input.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + int ti, hi, wi; + for (ti = tstart; ti < tend; ++ti) + { + for (hi = hstart; hi < hend; ++hi) + { + for (wi = wstart; wi < wend; ++wi) + { + Dtype val = input[slice][ti][hi][wi]; + sum += val; + } + } + } + + output[slice][oFrame][oRow][oCol] = ScalarConvert::to(sum / divide_factor); + } +} + +#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ + cuda_VolumetricAveragePooling_updateOutput_fixedKW \ + <<>>( \ + cudaInput, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW, count_include_pad, offsetZ); \ + break + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, int kW, + Acctype normFactor, int offsetZ) +{ + int iCol = blockIdx.x * blockDim.x + threadIdx.x; + int iRow = blockIdx.y * blockDim.y + threadIdx.y; + int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // input frame/time + int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // input slice/feature + + // guard against over-tiled threads + if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3)) + { + Acctype sum = 0.0; + Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] + [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)]; + int frameOffset = 0; + for (int oFrame = max(0, iFrame - kT + 1); + oFrame < min(iFrame + 1, gradOutput.getSize(1)); + ++oFrame) + { + int rowOffset = frameOffset; + for (int oRow = max(0, iRow - kH + 1); + oRow < min(iRow + 1, gradOutput.getSize(2)); + ++oRow) + { + int colOffset = rowOffset; + for (int oCol = max(0, iCol - kW + 1); + oCol < min(iCol + 1, gradOutput.getSize(3)); + ++oCol) + { + sum += gOut[colOffset]; + ++colOffset; + } + rowOffset += gradOutput.getSize(3); + } + frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3); + } + gradInput[slice][iFrame][iRow][iCol] = ScalarConvert::to(sum * normFactor); + } +} + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, 
int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature + + // guard against over-tiled threads + if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) + { + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, gradInput.getSize(1) + padT); + int hend = min(hstart + kH, gradInput.getSize(2) + padH); + int wend = min(wstart + kW, gradInput.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, gradInput.getSize(1)); + hend = min(hend, gradInput.getSize(2)); + wend = min(wend, gradInput.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + Dtype val = ScalarConvert::to( + ScalarConvert::to(gradOutput[slice][oFrame][oRow][oCol]) / divide_factor); + for (int iFrame = tstart; iFrame < tend; ++iFrame) + { + for (int iRow = hstart; iRow < hend; ++iRow) + { + for (int iCol = wstart; iCol < wend; ++iCol) + { + atomicAdd(&gradInput[slice][iFrame][iRow][iCol], val); + } + } + } + } +} + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature + + // guard against over-tiled threads + if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) + { + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, gradInput.getSize(1) + padT); + int hend = min(hstart + kH, gradInput.getSize(2) + padH); + int wend = min(wstart + kW, gradInput.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, gradInput.getSize(1)); + hend = min(hend, gradInput.getSize(2)); + wend = min(wend, gradInput.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + Dtype val = ScalarConvert::to( + ScalarConvert::to(gradOutput[slice][oFrame][oRow][oCol]) / divide_factor); + for (int iFrame = tstart; iFrame < tend; ++iFrame) + { + for (int iRow = hstart; iRow < hend; ++iRow) + { + for (int iCol = wstart; iCol < wend; ++iCol) + { + gradInput[slice][iFrame][iRow][iCol] = val; + } + } + } + } +} + +#include "generic/VolumetricAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu new file mode 100644 index 0000000..da66140 
--- /dev/null +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -0,0 +1,159 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +// Kernel for fast unfold+copy +// Borrowed from Theano +// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas +template +__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + const int64_t height_col, const int64_t width_col, const int64_t depth_col, + Dtype* data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int64_t d_out = index % depth_col; + int64_t w_index = index / depth_col; + int64_t w_out = w_index % width_col; + int64_t h_index = w_index / width_col; + int64_t h_out = h_index % height_col; + + int64_t channel_in = h_index / height_col; + //channel_in = 1; + + int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d; + + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; + int64_t d_in = d_out * stride_d - pad_d; + + Dtype* data_col_ptr = data_col; + data_col_ptr += channel_out * (height_col * width_col * depth_col) + + h_out * (width_col * depth_col) + w_out * depth_col + d_out; + + const Dtype* data_im_ptr = data_im; + data_im_ptr += channel_in * (height * width * depth) + + h_in * (width * depth) + w_in * depth + d_in; + + for (int64_t i = 0; i < kernel_h; ++i) + { + int64_t h = h_in + i; + for (int64_t j = 0; j < kernel_w; ++j) + { + int64_t w = w_in + j; + for (int64_t k = 0; k < kernel_d; ++k) + { + int64_t d = d_in + k; + *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 && + h < height && w < width && d < depth) ? + data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert::to(0); + data_col_ptr += height_col * width_col * depth_col; + } + } + } + } +} + +template +void im3d2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + Dtype* data_col) +{ + // We are going to launch channels * height_col * width_col * depth_col kernels, each + // kernel responsible for copying a single-channel grid. 
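+  // Column-buffer extents follow the usual convolution output formula,
+  // (size + 2*pad - kernel) / stride + 1 per spatial dimension; with
+  // illustrative numbers height = 5, kernel_h = 3, pad_h = 1, stride_h = 2
+  // this gives (5 + 2 - 3) / 2 + 1 = 3 output rows. data_col is laid out as
+  // channels * kernel_h * kernel_w * kernel_d rows of
+  // height_col * width_col * depth_col columns.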
+ int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1; + int64_t num_kernels = channels * height_col * width_col * depth_col; + im3d2col_kernel<<>>(num_kernels, data_im, + height, width, depth, + kernel_h, kernel_w, kernel_d, + pad_h, pad_w, pad_d, + stride_h, stride_w, stride_d, + height_col, width_col, depth_col, + data_col); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t channels, + const int64_t patch_h, const int64_t patch_w, const int64_t patch_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + const int64_t height_col, const int64_t width_col, const int64_t depth_col, + Dtype* data_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + Acctype val = 0; + int64_t d = index % depth + pad_d; + int64_t w_index = index / depth; + int64_t w = w_index % width + pad_w; + int64_t h_index = w_index / width; + int64_t h = h_index % height + pad_h; + int64_t c = h_index / height; + + // compute the start and end of the output + int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1; + int64_t d_col_end = min(d / stride_d + 1, depth_col); + int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int64_t w_col_end = min(w / stride_w + 1, width_col); + int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; + int64_t h_col_end = min(h / stride_h + 1, height_col); + + int64_t offset = + (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col; + + int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col; + int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col; + int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col); + for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col) + for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col]; + } + } + data_im[index] = ScalarConvert::to(val); + } +} + +template +void col2im3d(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t patch_h, const int64_t patch_w, const int64_t patch_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + Dtype* data_im) +{ + int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1; + int64_t num_kernels = channels * height * width * depth; + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
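+  // Each thread of col2im3d_kernel owns a single data_im element and sums
+  // every data_col entry that was unfolded from it; the precomputed offset and
+  // coeff_*_col terms fold the kernel-offset and column indices into one
+  // linear index, so the inner loops are pure additions. The accumulation
+  // stays private to the thread, which is why no atomicAdd is needed.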
+ col2im3d_kernel<<>>(num_kernels, data_col, + height, width, depth, channels, + patch_h, patch_w, patch_d, + pad_h, pad_w, pad_d, + stride_h, stride_w, stride_d, + height_col, width_col, depth_col, + data_im); + THCudaCheck(cudaGetLastError()); +} + +#include "generic/VolumetricConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu new file mode 100644 index 0000000..8a32c70 --- /dev/null +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -0,0 +1,9 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu new file mode 100644 index 0000000..1a0f2f6 --- /dev/null +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -0,0 +1,161 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( + Dtype* inputData, int inputT, int inputH, int inputW, + THCDeviceTensor indices, + THCDeviceTensor output, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oColumn < output.getSize(3)) + { + int tStart = oFrame * dT - padT; + int hStart = oRow * dH - padH; + int wStart = oColumn * dW - padW; + int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT); + int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH); + int wEnd = fminf(wStart + (kW - 1) * dilationW + 1, inputW); + + while(tStart < 0) + tStart += dilationT; + while(hStart < 0) + hStart += dilationH; + while(wStart < 0) + wStart += dilationW; + + int index = 0; + int maxIndex = -1; + inputData += slice * inputT * inputH * inputW; + + Dtype max = THCNumerics::min(); + + for (int t = tStart; t < tEnd; t += dilationT) + { + for (int h = hStart; h < hEnd; h += dilationH) + { + for (int w = wStart; w < wEnd; w += dilationW) + { + index = t * inputH * inputW + h * inputW + w; + Dtype val = inputData[index]; + + if ((max < val) || THCNumerics::isnan(val)) + { + max = val; + maxIndex = index; + } + } + } + } + + output[slice][oFrame][oRow][oColumn] = max; + indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE; + } +} + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( + Dtype* inputData, int inputT, int inputH, int inputW, + THCDeviceTensor indices, + THCDeviceTensor output, + int kT, int kH, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / 
output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oColumn < output.getSize(3)) + { + int tStart = oFrame * dT - padT; + int hStart = oRow * dH - padH; + int wStart = oColumn * dW - padW; + int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT); + int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH); + int wEnd = fminf(wStart + (KERNEL_WIDTH - 1) * dilationW + 1, inputW); + + while(tStart < 0) + tStart += dilationT; + while(hStart < 0) + hStart += dilationH; + while(wStart < 0) + wStart += dilationW; + + int index = 0; + int maxIndex = -1; + + Dtype max = THCNumerics::min(); + + for (int t = tStart; t < tEnd; t += dilationT) + { + for (int h = hStart; h < hEnd; h += dilationH) + { + for (int w = wStart; w < wEnd; w += dilationW) + { + index = t * inputH * inputW + h * inputW + w; + Dtype val = inputData[slice * inputT * inputH * inputW + index]; + + if (max < val) + { + max = val; + maxIndex = index; + } + } + } + } + + output[slice][oFrame][oRow][oColumn] = max; + indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE; + } +} + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput( + THCDeviceTensor gradOutput, + THCDeviceTensor indices, + Dtype* gradInputData, + int inputT, int inputH, int inputW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // output slice/feature + + if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3)) + { + int maxIndex = indices[slice][oFrame][oRow][oColumn] - TH_INDEX_BASE; + if (maxIndex != -1) { + atomicAdd(&gradInputData[slice * inputT * inputH * inputW + maxIndex], + gradOutput[slice][oFrame][oRow][oColumn]); + } + } +} + +#include "generic/VolumetricDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu new file mode 100644 index 0000000..e6260ce --- /dev/null +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -0,0 +1,120 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__device__ inline int getInterval(Acctype sample, + int index, + int inputSize, + int outputSize, + int poolSize) { + Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); + if (index == outputSize - 1) { + return inputSize - poolSize; + } else { + return (int) ((index + sample) * alpha) - (int) (sample * alpha); + } +} + +// We template on poolSizeW to allow the innermost loop to be unrolled +template +__global__ void VolumetricFractionalMaxPooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + THCDeviceTensor indices, + THCDeviceTensor samples, + int poolSizeT, int poolSizeW, int poolSizeH) { + + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < output.getSize(2) * output.getSize(3) * output.getSize(4)){ + int outputT 
= ourOutputPoint % output.getSize(4); + int outputW = (ourOutputPoint / output.getSize(4)) % output.getSize(3); + int outputH = ourOutputPoint / (output.getSize(3)*output.getSize(4)); + + int poolT = getInterval(ScalarConvert::to(samples[batch][plane][0]), outputT, + input.getSize(4), output.getSize(4), poolSizeT); + int poolW = getInterval(ScalarConvert::to(samples[batch][plane][1]), outputW, + input.getSize(3), output.getSize(3), poolSizeW); + int poolH = getInterval(ScalarConvert::to(samples[batch][plane][2]), outputH, + input.getSize(2), output.getSize(2), poolSizeH); + + Dtype maxVal = THCNumerics::min(); + int maxIndex = -1; + + for (int h = poolH; h < poolH + poolSizeH; ++h) { + for (int w = poolW; w < poolW + poolSizeW; ++w) { + if (PoolSizeTStatic == -1) { + for (int t = poolT; t < poolT + poolSizeT; ++t) { + Dtype val = input[batch][plane][h][w][t]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3)*input.getSize(4) + w * input.getSize(4) + t; + maxVal = val; + } + } + } else { +#pragma unroll + for (int i = 0; i < PoolSizeTStatic; ++i) { + int t = i + poolT; + Dtype val = input[batch][plane][h][w][t]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3)*input.getSize(4) + w * input.getSize(4) + t; + maxVal = val; + } + } + } + } + } + + assert(THCNumerics::ne(maxVal, THCNumerics::min())); + assert(maxIndex != -1); + + // +1 for Lua index + indices[batch][plane][outputH][outputW][outputT] = maxIndex + TH_INDEX_BASE; + output[batch][plane][outputH][outputW][outputT] = maxVal; + } +} + +template +__global__ void VolumetricFractionalMaxPooling_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + THCDeviceTensor indices) { + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3) * gradOutput.getSize(4)) { + int outputT = ourOutputPoint % gradOutput.getSize(4); + int outputW = (ourOutputPoint / gradOutput.getSize(4)) % gradOutput.getSize(3); + int outputH = ourOutputPoint / (gradOutput.getSize(3)*gradOutput.getSize(4)); + + int index = indices[batch][plane][outputH][outputW][outputT] - TH_INDEX_BASE; + assert(index >= 0); + int inputT = index % gradInput.getSize(4); + int inputW = (index / gradInput.getSize(4)) % gradInput.getSize(3); + int inputH = index / (gradInput.getSize(3) * gradInput.getSize(4)); + assert(inputH < gradInput.getSize(2)); + + atomicAdd(gradInput[batch][plane][inputH][inputW][inputT].data(), + gradOutput[batch][plane][outputH][outputW][outputT]); + } +} + +#include "generic/VolumetricFractionalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu new file mode 100644 index 0000000..556b5bc --- /dev/null +++ b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -0,0 +1,7 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu new file mode 100644 index 0000000..c5c7196 --- /dev/null +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -0,0 
+1,9 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricFullDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu new file mode 100644 index 0000000..43b8cef --- /dev/null +++ b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,421 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ + atomicAdd(&input[n][c][z][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + int c; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype 
bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { + out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; + } + if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { + out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; + } + if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { + out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; + } + if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { + out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; + } + if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { + out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; + } + if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { + out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; + } + if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { + out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; + } + if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { + out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; + } + output[n][c][d][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + Dtype giz = ScalarConvert::to(0); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, 
we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + Dtype gradout; + Dtype tnw_val; + Dtype tne_val; + Dtype tsw_val; + Dtype tse_val; + Dtype bnw_val; + Dtype bne_val; + Dtype bsw_val; + Dtype bse_val; + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][d][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, 
iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + tnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { + tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; + } + tne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, IH, IW)) { + tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; + } + tsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { + tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; + } + tse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { + tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; + } + bnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { + bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; + } + bne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { + bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; + } + bsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { + bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; + } + bse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { + bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; + } + + Dtype m1 = ScalarConvert::to(-1); + gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * 
(IW - 1) / 2;
+    giy = giy * (IH - 1) / 2;
+    giz = giz * (ID - 1) / 2;
+
+    Dtype gix_old = gradGrid[n][d][h][w][0];
+    Dtype giy_old = gradGrid[n][d][h][w][1];
+    Dtype giz_old = gradGrid[n][d][h][w][2];
+
+    gradGrid[n][d][h][w][0] = gix_old + gix;
+    gradGrid[n][d][h][w][1] = giy_old + giy;
+    gradGrid[n][d][h][w][2] = giz_old + giz;
+  }
+}
+
+#undef MIN
+#undef MAX
+#undef CLIP_COORDINATES
+#undef WITHIN_BOUNDS
+#undef SAFE_ADD
+
+#include "generic/VolumetricGridSamplerBilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricMaxPooling.cu b/aten/src/THCUNN/VolumetricMaxPooling.cu
new file mode 100644
index 0000000..2f7de7b
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricMaxPooling.cu
@@ -0,0 +1,10 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+
+#include <cfloat>
+
+#include "generic/VolumetricMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu
new file mode 100644
index 0000000..eac3b2d
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu
@@ -0,0 +1,57 @@
+#include "THCUNN.h"
+#include "THCTensor.hpp"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include <cfloat>
+
+template <typename Dtype>
+__global__ void cuda_VolumetricMaxUnpooling_updateOutput(
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  Dtype* outputData,
+  int oT, int oH, int oW,
+  int dT, int dH, int dW,
+  int padT, int padH, int padW, int offsetZ)
+{
+  int64_t iColumn = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t iRow = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t iFrame = (blockIdx.z + offsetZ) % input.getSize(1); // input frame/time
+  int64_t slice = (blockIdx.z + offsetZ) / input.getSize(1); // input slice/feature
+
+  if (iRow < input.getSize(2) && iColumn < input.getSize(3))
+  {
+    Dtype val = input[slice][iFrame][iRow][iColumn];
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    outputData[slice*oT*oH*oW + index] = val;
+  }
+}
+
+template <typename Dtype>
+__global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
+  Dtype* gradOutputData,
+  int oT, int oH, int oW,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> gradInput,
+  int dT, int dH, int dW,
+  int padT, int padH, int padW, int offsetZ)
+{
+  int iColumn = blockIdx.x * blockDim.x + threadIdx.x;
+  int iRow = blockIdx.y * blockDim.y + threadIdx.y;
+  int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // output frame/time
+  int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // output slice/feature
+
+  if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
+  {
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    Dtype grad_val = gradOutputData[slice*oT*oH*oW + index];
+    gradInput[slice][iFrame][iRow][iColumn] = grad_val;
+  }
+}
+
+#include "generic/VolumetricMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu
new file mode 100644
index 0000000..27ea3ec
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu
@@ -0,0 +1,90 @@
+#include "THCUNN.h"
+#include "THCTensor.hpp"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
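+// The padding kernels below replicate border values: each output coordinate is
+// clamped back to the nearest valid input coordinate. Illustrative sketch only
+// (sizes are made up, and IW stands for input.getSize(4)): for one axis with
+// non-negative padding the index math reduces to
+//   inputPointX = min(max(pleft, outputPointX), IW + pleft - 1) - pleft
+// so with IW = 4 and pleft = pright = 2:
+//   outputPointX: 0 1 2 3 4 5 6 7
+//   inputPointX:  0 0 0 1 2 3 3 3   (edge values are replicated)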
+#include "THCAtomics.cuh" +#include + +template +__global__ void VolumetricReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= (output.getSize(2) * output.getSize(3) * + output.getSize(4))) { + return; + } + int outputPointX = outputPointId % output.getSize(4); + int outputPointY = (outputPointId / output.getSize(4)) % output.getSize(3); + int outputPointZ = outputPointId / (output.getSize(3) * output.getSize(4)); + + int iStartX = max(0, -pleft); + int iStartY = max(0, -ptop); + int iStartZ = max(0, -pfront); + int oStartX = max(0, pleft); + int oStartY = max(0, ptop); + int oStartZ = max(0, pfront); + + int inputPointX = min(max(pleft, outputPointX), + input.getSize(4) + pleft - 1) - oStartX + iStartX; + int inputPointY = min(max(ptop, outputPointY), + input.getSize(3) + ptop - 1) - oStartY + iStartY; + int inputPointZ = min(max(pfront, outputPointZ), + input.getSize(2) + pfront - 1) - oStartZ + iStartZ; + + Dtype valueToCopy = + input[batch][plane][inputPointZ][inputPointY][inputPointX]; + output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void VolumetricReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + if (outputPointId >= (gradOutput.getSize(2) * gradOutput.getSize(3) * + gradOutput.getSize(4))) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(4); + int outputPointY = (outputPointId / gradOutput.getSize(4)) % + gradOutput.getSize(3); + int outputPointZ = outputPointId / (gradOutput.getSize(3) * + gradOutput.getSize(4)); + + int iStartX = max(0, -pleft); + int iStartY = max(0, -ptop); + int iStartZ = max(0, -pfront); + int oStartX = max(0, pleft); + int oStartY = max(0, ptop); + int oStartZ = max(0, pfront); + + int inputPointX = min(max(pleft, outputPointX), + gradInput.getSize(4) + pleft - 1) - oStartX + iStartX; + int inputPointY = min(max(ptop, outputPointY), + gradInput.getSize(3) + ptop - 1) - oStartY + iStartY; + int inputPointZ = min(max(pfront, outputPointZ), + gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ; + + Dtype valueToCopy = + gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX], + valueToCopy); +} + + +#include "generic/VolumetricReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu new file mode 100644 index 0000000..babbd58 --- /dev/null +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -0,0 +1,114 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_5d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + 
const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + const float depth_scale = (float) depth1 / (float) depth2; + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int d2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][d1][h1][w1]; + data2[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][d1][h1][w1]; + data2[n][c][d2][h2][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_5d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + const float depth_scale = (float) depth1 / (float) depth2; + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int d2 = index / (height2*width2); // 0:depth2-1 + + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][d1][h1][w1]; + data1[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][d2][h2][w2]; + atomicAdd(data1[n][c][d1][h1][w1].data(), val); + } + } + } +} + + +#include "generic/VolumetricUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu new file mode 100644 index 0000000..0f353b9 --- /dev/null +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -0,0 +1,159 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// 
Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__launch_bounds__(1024) +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int t2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][t1][h1][w1]; + data2[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const Acctype t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const Acctype t1lambda = t1r - t1; + const Acctype t0lambda = Acctype(1) - t1lambda; + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1] + + w1lambda * data1[n][c][t1][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][t1][h1+h1p][w1] + + w1lambda * data1[n][c][t1][h1+h1p][w1+w1p])) + + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1] + + w1lambda * data1[n][c][t1+t1p][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][t1+t1p][h1+h1p][w1] + + w1lambda * data1[n][c][t1+t1p][h1+h1p][w1+w1p])); + data2[n][c][t2][h2][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__launch_bounds__(1024) +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int t2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][t1][h1][w1]; + data1[n][c][t2][h2][w2] += val; + } + } + return; + } + // + const Acctype t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const Acctype t1lambda = t1r - t1; + const Acctype t0lambda = Acctype(1) - t1lambda; + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0;
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
+    //
+    for (int n = 0; n < batchsize ; n++){
+      for (int c = 0; c < channels; ++c) {
+        const Dtype d2val = data2[n][c][t2][h2][w2];
+        atomicAdd(data1[n][c][t1][h1][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h0lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h0lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1+h1p][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h1lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1+h1p][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h1lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h0lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h0lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h1lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h1lambda * w1lambda * d2val));
+      }
+    }
+  }
+  /////////////////////////////////////////////////////////
+}
+
+
+#include "generic/VolumetricUpSamplingTrilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
new file mode 100644
index 0000000..5149003
--- /dev/null
+++ b/aten/src/THCUNN/common.h
@@ -0,0 +1,85 @@
+#ifndef THCUNN_COMMON_H
+#define THCUNN_COMMON_H
+
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
+
+#define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
+  "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+const int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks for threads.
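+// GET_BLOCKS below is a ceiling division: it launches enough blocks of
+// CUDA_NUM_THREADS threads to cover all N elements. For example, N = 1500 with
+// 1024 threads per block gives (1500 + 1023) / 1024 = 2 blocks; the surplus
+// threads simply fail the `i < n` test in CUDA_KERNEL_LOOP (or the equivalent
+// `index < n` guard) and do no work.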
+inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define THCUNN_resizeAs_indices(STATE, I1, I2) \ + THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ + if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ + { \ + THCudaLongTensor_resize(STATE, I1, size2, NULL); \ + } \ + THLongStorage_free(size2); + +#define THCUNN_check_shape(STATE, I1, I2) \ + if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \ + { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } + + +#define THCUNN_check_shape_indices(STATE, I1, I2) \ + THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ + if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ + { \ + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } \ + THLongStorage_free(size2); + +#define THCUNN_check_nElement(STATE, I1, I2) \ + if (I1 != NULL && I2 != NULL ) { \ + ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \ + ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \ + if (n1 != n2) \ + { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \ + THError(#I1 " and " #I2 " have different number of elements: " \ + #I1 "%s has %ld elements, while " \ + #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ + } \ + } + +#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ + if (THCTensor_(nDimension)(STATE, T) != DIM || \ + THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ + if (THCIndexTensor_(nDimension)(STATE, T) != DIM || \ + THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \ + if (!(COND)) { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ + THArgCheck(COND, ARG, FORMAT, s1.str); \ + } + +#endif diff --git a/aten/src/THCUNN/generic/Abs.cu b/aten/src/THCUNN/generic/Abs.cu new file mode 100644 index 0000000..0b2a5e7 --- /dev/null +++ b/aten/src/THCUNN/generic/Abs.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Abs.cu" +#else + +#include "../common.h" + +void THNN_(Abs_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, absupdateOutput_functor()); +} + +void THNN_(Abs_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor()); +} + +#endif diff --git 
a/aten/src/THCUNN/generic/AbsCriterion.cu b/aten/src/THCUNN/generic/AbsCriterion.cu new file mode 100644 index 0000000..d1faeaa --- /dev/null +++ b/aten/src/THCUNN/generic/AbsCriterion.cu @@ -0,0 +1,82 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/AbsCriterion.cu" +#else + +void THNN_(AbsCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + abs_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus(), abs_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(AbsCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradOutput, gradInput); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + abs_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to(reduction == Reduction::ElementwiseMean ? 
1./size : 1.); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + abs_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/BCECriterion.cu b/aten/src/THCUNN/generic/BCECriterion.cu new file mode 100644 index 0000000..3dcde62 --- /dev/null +++ b/aten/src/THCUNN/generic/BCECriterion.cu @@ -0,0 +1,130 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BCECriterion.cu" +#else + +void THNN_(BCECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights) +{ + THCUNN_check_nElement(state, input, target); + THCUNN_check_nElement(state, input, weights); + THCUNN_assertSameGPU(state, 3, input, target, weights); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + bce_updateOutput_no_reduce_functor()); + if (weights) { + THCTensor_(cmul)(state, output, output, weights); + } + return; + } + + THCTensor_(resize1d)(state, output, 1); + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + + accreal sum; + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); + sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + bce_functor_weights(), + (accreal) 0, + thrust::plus() + ); + THCTensor_(free)(state, weights); + } else { + sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + bce_functor(), + (accreal) 0, + thrust::plus() + ); + } + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights) +{ + THCUNN_check_nElement(state, input, target); + THCUNN_check_nElement(state, input, weights); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_nElement(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + bce_updateGradInput_no_reduce_functor()); + 
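+    // For binary cross entropy, o = -(t * log(x) + (1 - t) * log(1 - x)), so
+    // the per-element gradient is do/dx = (x - t) / (x * (1 - x)). The functor
+    // above is expected to produce that value; the cmul below then applies the
+    // chain rule with gradOutput (and, further down, the optional weights).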
THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + if (weights) { + THCTensor_(cmul)(state, gradInput, gradInput, weights); + } + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to((reduction == Reduction::ElementwiseMean ? accreal(1)/size : accreal(1)) * THCTensor_(get1d)(state, gradOutput, 0)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + gradInput_data, + bce_updateGradInput_functor_weights(norm) + ); + THCTensor_(free)(state, weights); + } else { + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + gradInput_data, + bce_updateGradInput_functor(norm) + ); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu new file mode 100644 index 0000000..1eb3b82 --- /dev/null +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -0,0 +1,108 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BatchNormalization.cu" +#else + +#define DeviceTensor3 THCDeviceTensor +#define DeviceTensor1 THCDeviceTensor + +template +static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor *t) { + if (!t) { + return THCDeviceTensor(); + } + + int inDim = THCTensor__nDimension(state, t); + if (inDim == Dim) { + return toDeviceTensor(state, t); + } + + // View in which the last dimensions are collapsed or expanded as needed + THAssert(THCTensor_isContiguous(state, t)); + int size[Dim]; + for (int i = 0; i < Dim || i < inDim; ++i) { + if (i < Dim && i < inDim) { + size[i] = t->size[i]; + } else if (i < Dim) { + size[i] = 1; + } else { + size[Dim - 1] *= t->size[i]; + } + } + return THCDeviceTensor(t->data(), size); +} + +void THNN_(BatchNormalization_updateOutput)( + THCState *state, THCTensor *input_, THCTensor *output_, + THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, + THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, + bool train, double momentum, double eps) { + + THCTensor_(resizeAs)(state, output_, input_); + if (train) { + int64_t nInput = THCTensor_(size)(state, input_, 1); + THCTensor_(resize1d)(state, saveMean_, nInput); + THCTensor_(resize1d)(state, saveStd_, nInput); + } + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 output = THNN_(devicetensor)<3>(state, output_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 bias = THNN_(devicetensor)<1>(state, bias_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = 
THNN_(devicetensor)<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); + + if (!train) { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutputInference_kernel <<>>( + input, output, runningMean, runningVar, weight, bias, eps); + } else { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutput_kernel <<>>( + input, output, weight, bias, eps, momentum, runningMean, runningVar, + saveMean, saveStd); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(BatchNormalization_backward)( + THCState *state, THCTensor *input_, THCTensor *gradOutput_, + THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_, + THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, + THCTensor *saveMean_, THCTensor *saveStd_, bool train, double scale, double eps) { + + THCUNN_check_shape(state, input_, gradOutput_); + if (gradInput_) { + THCTensor_(resizeAs)(state, gradInput_, input_); + } + + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 gradOutput = THNN_(devicetensor)<3>(state, gradOutput_); + DeviceTensor3 gradInput = THNN_(devicetensor)<3>(state, gradInput_); + DeviceTensor1 gradWeight = THNN_(devicetensor)<1>(state, gradWeight_); + DeviceTensor1 gradBias = THNN_(devicetensor)<1>(state, gradBias_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + + dim3 blocks(gradOutput.getSize(1)); + dim3 threads(getNumThreads(gradOutput.getSize(2))); + BatchNormalizationBackward_kernel <<>>( + input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, + saveMean, saveStd, train, scale, eps); + THCudaCheck(cudaGetLastError()); +} + +#undef DeviceTensor3 +#undef DeviceTensor1 + +#endif diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu new file mode 100644 index 0000000..9508cc8 --- /dev/null +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -0,0 +1,235 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ClassNLLCriterion.cu" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (weights) { + THCUNN_assertSameGPU( + state, 5, input, target, weights, output, total_weight + ); + } else { + THCUNN_assertSameGPU( + state, 4, input, target, output, total_weight + ); + } + + THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); + + int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); + THArgCheck(batch_size == num_targets, + 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", + batch_size, num_targets); + + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THCDescBuff s1 = THCTensor_(sizeDesc)(state, weights); + THError("weight tensor should be defined either for all %d classes or no classes" + " but got weight tensor of shape: %s", n_classes, s1.str); + } + + if (reduction == Reduction::None && n_dims == 2) { + THCTensor_(resize1d)(state, output, batch_size); + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + ClassNLLCriterion_updateOutput_no_reduce_kernel + <<>>( + batch_size, + toDeviceTensor(state, input), + toDeviceTensor(state, target), + toDeviceTensor(state, output), + weights ? THCTensor_(data)(state, weights) : NULL, + n_classes, + ignore_index); + + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + THCTensor_(resize1d)(state, output, 1); + THCTensor_(resize1d)(state, total_weight, 1); + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateOutput_kernel1 + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + n_classes, + ignore_index + ); + + } else if (THCTensor_(nDimension)(state, input) == 2) { + cunn_ClassNLLCriterion_updateOutput_kernel + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes, + ignore_index + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, "gradInput must be contiguous"); + + if (weights) { + THCUNN_assertSameGPU( + state, 5, weights, input, target, gradInput, total_weight + ); + } + else { + THCUNN_assertSameGPU( + state, 4, input, target, gradInput, total_weight + ); + } + + THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); + + int64_t batch_size = n_dims == 
1 ? 1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); + THArgCheck(batch_size == num_targets, + 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", + batch_size, num_targets); + + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + if (reduction == Reduction::None && n_dims == 2) { + THCUNN_check_dim_size(state, gradOutput, 1, 0, batch_size); + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + ClassNLLCriterion_updateGradInput_no_reduce_kernel + <<>>( + batch_size, + toDeviceTensor(state, target), + toDeviceTensor(state, gradOutput), + toDeviceTensor(state, gradInput), + weights ? THCTensor_(data)(state, weights) : NULL, + n_classes, + ignore_index); + + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + ignore_index -= TH_INDEX_BASE; + + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + real *gradInput_data = THCTensor_(data)(state, gradInput); + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateGradInput_kernel1 + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + gradOutput_data, + weights_data, + target_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + n_classes, + ignore_index + ); + } else { + cunn_ClassNLLCriterion_updateGradInput_kernel + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + gradOutput_data, + target_data, + weights_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes, + ignore_index + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/Col2Im.cu b/aten/src/THCUNN/generic/Col2Im.cu new file mode 100644 index 0000000..c0a074c --- /dev/null +++ b/aten/src/THCUNN/generic/Col2Im.cu @@ -0,0 +1,129 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Col2Im.cu" +#else + +static inline void THNN_(Col2Im_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 6, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(sW > 0 && sH > 0, 12, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + THArgCheck(dW > 0 && dH > 0, 8, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int64_t ndim = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 2, input, + "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); + + int batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size[batch_dim + 1]; + + if (nInputPlane % (kW * kH) != 0) { + THError("Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=%lld and " + "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); + } + + int64_t inputLength = input->size[batch_dim + 2]; + int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; + int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; + + if (inputLength != (nBlocksH * nBlocksW)) { + THError("Given output_size=(%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected " + "size of input's dimension 2 to match the calculated number of " + "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.", + outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW, + (long long) nBlocksH, (long long) nBlocksW, + (long long) (nBlocksH * nBlocksW), (long long) inputLength); + } + + if (outputWidth < 1 || outputHeight < 1) { + THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).", + outputHeight, outputWidth); + } +} + +void THNN_(Col2Im_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THCUNN_assertSameGPU(state, 2, input, output); + + THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, sH, sW); + + bool batched_input = true; + if (input->dim() == 2) { + // Force batch + batched_input = false; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + } + + int64_t batchSize = input->size[0]; + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = nInputPlane / (kW * kH); + + input = THCTensor_(newContiguous)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + THCTensor_(zero)(state, output); + + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + for (int64_t elt = 0; elt < batchSize; elt++) { + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nOutputPlane, + outputHeight, outputWidth, + height_col, width_col, + kH, kW, + padH, padW, + sH, sW, + dH, dW, THCTensor_(data)(state, output_n)); + } + + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + if (!batched_input) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + } + THCTensor_(free)(state, input); +} + +void THNN_(Col2Im_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, + kH, kW, dH, dW, padH, padW, sH, sW); + +} + +#endif diff --git a/aten/src/THCUNN/generic/DistKLDivCriterion.cu b/aten/src/THCUNN/generic/DistKLDivCriterion.cu new file mode 100644 index 0000000..e798285 --- /dev/null +++ b/aten/src/THCUNN/generic/DistKLDivCriterion.cu @@ -0,0 +1,89 @@ +#ifndef THC_GENERIC_FILE 
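+// DistKLDivCriterion follows the usual KL-divergence convention in which the
+// input x holds log-probabilities: per element, loss(x, t) = t * (log(t) - x).
+// Quick numeric check (illustrative values only): t = 0.5, x = log(0.25)
+// gives 0.5 * (log(0.5) - log(0.25)) = 0.5 * log(2) ~= 0.3466.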
+#define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 2, input, target); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + kl_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + accreal sum; + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), kl_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, target, gradOutput, gradInput, + kl_updateGradInput_no_reduce_functor()); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = (reduction == Reduction::ElementwiseMean ? 
ScalarConvert::to(accreal(1)/size) : ScalarConvert::to(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + kl_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu new file mode 100644 index 0000000..5c09a06 --- /dev/null +++ b/aten/src/THCUNN/generic/ELU.cu @@ -0,0 +1,50 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ELU.cu" +#else + +#include "../common.h" + + +void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal alpha, + accreal scale, + bool inplace) +{ + real negcoef = ScalarConvert::to(alpha * scale); + real poscoef = ScalarConvert::to(scale); + THCUNN_assertSameGPU(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); + } +} + + +void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal alpha, + accreal scale) +{ + real negcoef = ScalarConvert::to(alpha * scale); + real poscoef = ScalarConvert::to(scale); + THCUNN_check_nElement(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); +} + +#endif diff --git a/aten/src/THCUNN/generic/FeatureLPPooling.cu b/aten/src/THCUNN/generic/FeatureLPPooling.cu new file mode 100644 index 0000000..3f95bcd --- /dev/null +++ b/aten/src/THCUNN/generic/FeatureLPPooling.cu @@ -0,0 +1,267 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/FeatureLPPooling.cu" +#else + +#include "../common.h" + +// non-batch mode: +// [feature dim] +// [feature dim][opt dim 1] +// [feature dim][opt dim 1][opt dim 2] +// +// batch mode: +// [batch dim][feature dim] +// [batch dim][feature dim][opt dim 1] +// [batch dim][feature dim][opt dim 1][opt dim 2] +THCDeviceTensor +THNN_(FeatureLPPooling_upcast)(THCState* state, THCTensor* t, bool batchMode) { + int inputDim = THCTensor_(_nDimension)(state, t); + + if (inputDim == 1) { + // [feature dim] + return toDeviceTensor(state, t). + upcastOuter<2>().upcastInner<4>(); + } else if (inputDim == 2) { + if (batchMode) { + // [batch dim][feature dim] + return toDeviceTensor(state, t). + upcastInner<4>(); + } else { + // [feature dim][opt dim 1] + return toDeviceTensor(state, t). + upcastOuter<3>().upcastInner<4>(); + } + } else if (inputDim == 3) { + if (batchMode) { + // [batch dim][feature dim][opt dim 1] + return toDeviceTensor(state, t). + upcastInner<4>(); + } else { + // [feature dim][opt dim 1][opt dim 2] + return toDeviceTensor(state, t). 
+ upcastOuter<4>(); + } + } else { + // inputDim == 4 + // [batch dim][feature dim][opt dim 1][opt dim 2] + THAssert(batchMode); + return toDeviceTensor(state, t); + } +} + +// Resizes `toResize` based on the output size for `src` as an input +// tensor +void +THNN_(FeatureLPPooling_resizeForOutput)(THCState* state, + THCTensor* toResize, + THCTensor* input, + bool batchMode, + int width, + int stride) { + int inputDim = THCTensor_(_nDimension)(state, input); + THAssert(inputDim >= 1 && inputDim <= 4); + + int64_t outSize = + lpPoolingOutputSize(THCTensor_(size)(state, input, 0), width, stride); + if (batchMode) { + THAssert(inputDim > 1); + outSize = + lpPoolingOutputSize(THCTensor_(size)(state, input, 1), width, stride); + } else { + THAssert(inputDim < 4); + } + + if (inputDim == 1) { + THCTensor_(resize1d)(state, toResize, outSize); + } else if (inputDim == 2) { + if (batchMode) { + THCTensor_(resize2d)( + state, toResize, THCTensor_(size)(state, input, 0), outSize); + } else { + THCTensor_(resize2d)( + state, toResize, outSize, THCTensor_(size)(state, input, 1)); + } + } else if (inputDim == 3) { + if (batchMode) { + THCTensor_(resize3d)( + state, + toResize, + THCTensor_(size)(state, input, 0), outSize, + THCTensor_(size)(state, input, 2)); + } else { + THCTensor_(resize3d)( + state, + toResize, + outSize, THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2)); + } + } else if (inputDim == 4) { + THCTensor_(resize4d)( + state, + toResize, + THCTensor_(size)(state, input, 0), outSize, + THCTensor_(size)(state, input, 2), THCTensor_(size)(state, input, 3)); + } +} + +// Makes `toResize` the same size/dimensionality as `src` +void +THNN_(FeatureLPPooling_resize)(THCState* state, + THCTensor* toResize, + THCTensor* src) { + int inputDim = THCTensor_(_nDimension)(state, src); + THAssert(inputDim >= 1 && inputDim <= 4); + + if (inputDim == 1) { + THCTensor_(resize1d)(state, + toResize, + THCTensor_(size)(state, src, 0)); + } else if (inputDim == 2) { + THCTensor_(resize2d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1)); + } else if (inputDim == 3) { + THCTensor_(resize3d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1), + THCTensor_(size)(state, src, 2)); + } else if (inputDim == 4) { + THCTensor_(resize4d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1), + THCTensor_(size)(state, src, 2), + THCTensor_(size)(state, src, 3)); + } +} + +void THNN_(FeatureLPPooling_updateOutput)(THCState* state, + THCTensor* inputTH, + THCTensor* outputTH, + accreal power, + int width, + int stride, + bool batchMode) { + THCUNN_assertSameGPU(state, 2, inputTH, outputTH); + + int inputDim = THCTensor_(_nDimension)(state, inputTH); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 2, + "input tensor must fit into 32-bit index math"); + + THCDeviceTensor input; + THCDeviceTensor output; + + input = THNN_(FeatureLPPooling_upcast)(state, inputTH, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(input.getSize(1) >= width, 2, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 5, + "width must be between 
2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 6, + "stride must be between 1 - 4"); + + THNN_(FeatureLPPooling_resizeForOutput)( + state, outputTH, inputTH, batchMode, width, stride); + + output = THNN_(FeatureLPPooling_upcast)(state, outputTH, batchMode); + + bool found = runFeatureLPPoolingUpdateOutput(state, + input, + output, + power, + width, + stride); + THAssert(found); +} + +void THNN_(FeatureLPPooling_updateGradInput)(THCState* state, + THCTensor* gradOutputTH, + THCTensor* inputTH, + THCTensor* outputTH, + THCTensor* gradInputTH, + accreal power, + int width, + int stride, + bool batchMode) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutputTH), 2, + "output gradient tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 3, + "input tensor must fit into 32-bit index math"); + THCUNN_assertSameGPU(state, 4, gradOutputTH, inputTH, outputTH, gradInputTH); + + int inputDim = THCTensor_(_nDimension)(state, inputTH); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + THCDeviceTensor gradOutput; + THCDeviceTensor input; + THCDeviceTensor output; + THCDeviceTensor gradInput; + + input = THNN_(FeatureLPPooling_upcast)(state, inputTH, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(input.getSize(1) >= width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 7, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 8, + "stride must be between 1 - 4"); + + gradOutput = THNN_(FeatureLPPooling_upcast)(state, gradOutputTH, batchMode); + output = THNN_(FeatureLPPooling_upcast)(state, outputTH, batchMode); + + for (int i = 0; i < 4; ++i) { + THAssertMsg(output.getSize(i) == gradOutput.getSize(i), + "output and gradOutput sizes do not match"); + } + + // Make sure that the input sizes produce the output sizes + THArgCheck(lpPoolingOutputSize(input.getSize(1), width, stride) == + output.getSize(1), 3, + "input and output sizes do not match with respect to " + "width and stride"); + + // Resize `gradInput` based on `input` + THNN_(FeatureLPPooling_resize)(state, gradInputTH, inputTH); + gradInput = THNN_(FeatureLPPooling_upcast)(state, gradInputTH, batchMode); + + bool found = runFeatureLPPoolingUpdateGradInput(state, + gradOutput, + input, + output, + gradInput, + power, + width, + stride); + THAssert(found); +} + +#endif diff --git a/aten/src/THCUNN/generic/FusedRNNKernel.cu b/aten/src/THCUNN/generic/FusedRNNKernel.cu new file mode 100644 index 0000000..1d16bc4 --- /dev/null +++ b/aten/src/THCUNN/generic/FusedRNNKernel.cu @@ -0,0 +1,785 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/FusedRNNKernel.cu" +#else +#include + +#include "../common.h" + +#define TINFO TensorInfo + +//factor will be 3 for GRU and 4 for LSTM +void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) 
+{ + va_list list; + va_start(list, count); + THCTensor *input = va_arg(list, THCTensor*); + THCTensor *hidden = va_arg(list, THCTensor*); + THArgCheck(THCTensor_(nElement)(state, input) == + THCTensor_(nElement)(state, hidden), + 3, "Input and Hidden tensor sizes should be the same."); + + THAssertMsg(THCTensor__nDimension(state, input) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + + THAssertMsg(THCTensor__nDimension(state, hidden) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + + for (int arg=2; arg < count; ++arg){ + THCTensor *tens = va_arg(list, THCTensor*); + THArgCheck(THCTensor_(nElement)(state, input) == + THCTensor_(nElement)(state, tens)*factor, + 3, "A pointwise tensor was not the right size, should have 1/%u the elements of input/hidden tensor.", arg, factor); + THAssertMsg(THCTensor__nDimension(state, tens) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + } + + va_end(list); +} + +int THNN_(minIndexType)(THCState *state, int count, ...) +{ + va_list list; + va_start(list, count); + + THCTensor* tens = va_arg(list, THCTensor*); + int startDim = THCTensor__nDimension(state, tens); + bool canCollapse = THCTensor_(isContiguous)(state,tens); + + for (int arg=1; arg < count; ++arg){ + tens = va_arg(list, THCTensor*); + canCollapse = canCollapse && THCTensor_(isContiguous)(state, tens); + if(THCTensor__nDimension(state, tens) != startDim){ + va_end(list); + return -1; + } + } + va_end(list); + if(canCollapse) return -2; + return startDim; +} + +bool THNN_(canUse32BitIndexMath)(THCState *state, int count, ...) +{ + va_list list; + va_start(list, count); + + for (int arg=0; arg < count; ++arg){ + THCTensor *tens = va_arg(list, THCTensor*); + if (!THCTensor_canUse32BitIndexMath(state, tens)){ + va_end(list); + return false; + } + } + va_end(list); + return true; +} + +#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +#define H2F(input) ScalarConvert::to(input) +#define F2H(input) ScalarConvert::to(input) + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +THNN_(GRUForward)(TensorInfo Input, + TensorInfo Hidden, + TensorInfo Bias1, + TensorInfo Bias2, + TensorInfo _hx, + TensorInfo _hy, + TensorInfo storage, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) + { + + IndexType offset = (linearIndex/hsz)*3*hsz+linearIndex%hsz; + + T ir = DEVICE_LINEAR_GET(Input, offset+0*hsz); + T ii = DEVICE_LINEAR_GET(Input, offset+1*hsz); + T in = DEVICE_LINEAR_GET(Input, offset+2*hsz); + T hr = DEVICE_LINEAR_GET(Hidden,offset+0*hsz); + T hi = DEVICE_LINEAR_GET(Hidden,offset+1*hsz); + T hn = DEVICE_LINEAR_GET(Hidden, offset+2*hsz); + + T hx = DEVICE_LINEAR_GET(_hx, linearIndex); + T* hy = &DEVICE_LINEAR_GET(_hy, linearIndex); + + bool has_bias = (Bias1.data != NULL); + + T b1r, b1i, b1n, b2r, b2i, b2n; + + if(has_bias){ + b1r = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+0*hsz); + b1i = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+1*hsz); + b1n = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+2*hsz); + + b2r = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+0*hsz); + b2i = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+1*hsz); + b2n = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+2*hsz); + }else{ +#ifndef THC_REAL_IS_HALF + b1r = 0.0; b1i = 0.0; b1n = 0.0; + b2r = 0.0; b2i = 0.0; b2n = 0.0; +#else + b1r = F2H(0.0); b1i = F2H(0.0); b1n = F2H(0.0); + b2r = F2H(0.0); b2i 
= F2H(0.0); b2n = F2H(0.0); +#endif + } + + + offset = (linearIndex/hsz)*5*hsz+linearIndex%hsz; + + accreal rg, ig, ng; + + rg = H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r); + ig = H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i); + + TensorSigmoidOp()(&rg, &rg); + TensorSigmoidOp()(&ig, &ig); + ng = H2F(in) + H2F(b1n) + rg*( H2F(hn)+H2F(b2n) ); + ng = THCNumerics::tanh(ng); + *hy = F2H( ng + ig * ( H2F(hx)-ng ) ); + + //SAVE FOR BACKWARDS + DEVICE_LINEAR_GET(storage, offset+0*hsz) = F2H(rg); + DEVICE_LINEAR_GET(storage, offset+1*hsz) = F2H(ig); + DEVICE_LINEAR_GET(storage, offset+2*hsz) = F2H(ng); + DEVICE_LINEAR_GET(storage, offset+3*hsz) = hx; + DEVICE_LINEAR_GET(storage, offset+4*hsz) = F2H(H2F(hn) + H2F(b2n)); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +THNN_(GRUBackward)(TensorInfo gradInInput, + TensorInfo gradInHidden, + TensorInfo gradOutput, + TensorInfo gradInputHx, + TensorInfo storage, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + IndexType offset = (linearIndex/hsz)*5*hsz+linearIndex%hsz; + + T rg = DEVICE_LINEAR_GET(storage, offset+0*hsz); + T ig = DEVICE_LINEAR_GET(storage, offset+1*hsz); + T ng = DEVICE_LINEAR_GET(storage, offset+2*hsz); + T hx = DEVICE_LINEAR_GET(storage, offset+3*hsz); + T hn = DEVICE_LINEAR_GET(storage, offset+4*hsz); + + T go = DEVICE_LINEAR_GET(gradOutput, linearIndex); + + offset = (linearIndex/hsz)*3*hsz+linearIndex%hsz; + + accreal gig = H2F(go)*( H2F(hx)-H2F(ng) )*( 1-H2F(ig) )*H2F(ig); + accreal ghx = H2F(go)*H2F(ig); + accreal gin = H2F(go)*( 1-H2F(ig) )*( 1-H2F(ng)*H2F(ng) ); + accreal ghn = gin * H2F(rg); + accreal grg = gin *H2F(hn)*( 1-H2F(rg) )*H2F(rg); + + DEVICE_LINEAR_GET(gradInInput, offset+0*hsz) = F2H(grg); + DEVICE_LINEAR_GET(gradInInput, offset+1*hsz) = F2H(gig); + DEVICE_LINEAR_GET(gradInInput, offset+2*hsz) = F2H(gin); + + DEVICE_LINEAR_GET(gradInHidden, offset+0*hsz) = F2H(grg); + DEVICE_LINEAR_GET(gradInHidden, offset+1*hsz) = F2H(gig); + DEVICE_LINEAR_GET(gradInHidden, offset+2*hsz) = F2H(ghn); + DEVICE_LINEAR_GET(gradInputHx, linearIndex) = F2H(ghx); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void + THNN_(LSTMForward)(TensorInfo input, + TensorInfo hidden, + TensorInfo bias1, + TensorInfo bias2, + TensorInfo _cx, + TensorInfo _hy, + TensorInfo _cy, + IndexType hsz, + IndexType totalElements) +{ + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) + { + + IndexType offset = (linearIndex/hsz)*4*hsz+linearIndex%hsz; + + T* iig = &DEVICE_LINEAR_GET(input, offset+0*hsz); + T* ifg = &DEVICE_LINEAR_GET(input, offset+1*hsz); + T* icg = &DEVICE_LINEAR_GET(input, offset+2*hsz); + T* iog = &DEVICE_LINEAR_GET(input, offset+3*hsz); + + T hig = DEVICE_LINEAR_GET(hidden, offset+0*hsz); + T hfg = DEVICE_LINEAR_GET(hidden, offset+1*hsz); + T hcg = DEVICE_LINEAR_GET(hidden, offset+2*hsz); + T hog = DEVICE_LINEAR_GET(hidden, offset+3*hsz); + + T cx = DEVICE_LINEAR_GET(_cx, linearIndex); + + T* hy = &DEVICE_LINEAR_GET(_hy, linearIndex); + T* cy = &DEVICE_LINEAR_GET(_cy, linearIndex); + + bool has_bias = (bias1.data != NULL); + + T b1i, b1f, b1c, b1o; + T b2i, b2f, b2c, b2o; + + if(has_bias){ + b1i = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+0*hsz); + b1f = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+1*hsz); + b1c = 
DEVICE_LINEAR_GET(bias1, linearIndex%hsz+2*hsz); + b1o = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+3*hsz); + + b2i = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+0*hsz); + b2f = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+1*hsz); + b2c = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+2*hsz); + b2o = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+3*hsz); + + }else{ +#ifndef THC_REAL_IS_HALF + b1i = 0.0; b1f = 0.0; b1c = 0.0; b1o = 0.0; + b2i = 0.0; b2f = 0.0; b2c = 0.0; b2o = 0.0; +#else + b1i = F2H(0.0); b1f = F2H(0.0); b1c = F2H(0.0); b1o = F2H(0.0); + b2i = F2H(0.0); b2f = F2H(0.0); b2c = F2H(0.0); b2o = F2H(0.0); +#endif + } + + accreal ig, fg, cg, og; + accreal f_hy, f_cy; + + ig = H2F(*iig) + H2F(hig) + H2F(b1i) + H2F(b2i); + fg = H2F(*ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f); + cg = H2F(*icg) + H2F(hcg) + H2F(b1c) + H2F(b2c); + og = H2F(*iog) + H2F(hog) + H2F(b1o) + H2F(b2o); + + TensorSigmoidOp()(&ig, &ig); + TensorSigmoidOp()(&fg, &fg); + cg = THCNumerics::tanh(cg); + TensorSigmoidOp()(&og, &og); + + f_cy = (fg * H2F(cx) ) + (ig * cg); + f_hy = og * THCNumerics::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + //SAVE FOR BACKWARDS + //Also need cy and cx but can be saved easily in python + *iig = F2H(ig); + *ifg = F2H(fg); + *icg = F2H(cg); + *iog = F2H(og); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void + THNN_(LSTMBackward)(TensorInfo storage, + TensorInfo gradInGates, + TensorInfo _cx, + TensorInfo _cy, + TensorInfo gradoutput, + TensorInfo gradoutputcell, + TensorInfo gradInputCx, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + IndexType offset = (linearIndex/hsz)*4*hsz+linearIndex%hsz; + + T ig = DEVICE_LINEAR_GET(storage, offset+0*hsz); + T fg = DEVICE_LINEAR_GET(storage, offset+1*hsz); + T cg = DEVICE_LINEAR_GET(storage, offset+2*hsz); + T og = DEVICE_LINEAR_GET(storage, offset+3*hsz); + + T* ih = &DEVICE_LINEAR_GET(gradInGates, offset+0*hsz); + T* fh = &DEVICE_LINEAR_GET(gradInGates, offset+1*hsz); + T* ch = &DEVICE_LINEAR_GET(gradInGates, offset+2*hsz); + T* oh = &DEVICE_LINEAR_GET(gradInGates, offset+3*hsz); + + //will return hidden grads here + T cx = DEVICE_LINEAR_GET(_cx, linearIndex); + T cy = DEVICE_LINEAR_GET(_cy, linearIndex); + + T* gi = &DEVICE_LINEAR_GET(gradInputCx, linearIndex); + + T go = DEVICE_LINEAR_GET(gradoutput, linearIndex); + T goc= DEVICE_LINEAR_GET(gradoutputcell, linearIndex); + + accreal gcx = THCNumerics::tanh(H2F(cy)); + + + accreal gog = H2F(go) * gcx; + gcx = H2F(go) * H2F(og) * ( 1 - gcx*gcx) + H2F(goc); + + accreal gig = gcx * H2F(cg); + accreal gfg = gcx * H2F(cx); + accreal gcg = gcx * H2F(ig); + + gcx = gcx * H2F(fg); + + gig = gig * (1-H2F(ig)) * H2F(ig); + gfg = gfg * (1-H2F(fg)) * H2F(fg); + gcg = gcg * (1-H2F(cg)*H2F(cg)); + gog = gog * (1-H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + *gi = F2H(gcx); + + } +} + + +// ************ START Create function calls ********** // +#define FILL_FUNCTION(ITYPE, DIM, FUNCTION) FUNCTION(ITYPE, DIM) + +#define FILL_DIM(ITYPE, DIM, FUNCTION) \ + switch (DIM) { \ + case -2: \ + FILL_FUNCTION(ITYPE, -2, FUNCTION); \ + break; \ + case 1: \ + FILL_FUNCTION(ITYPE, 1, FUNCTION); \ + break; \ + case 2: \ + FILL_FUNCTION(ITYPE, 2, FUNCTION); \ + break; \ + default: \ + FILL_FUNCTION(ITYPE, -1, FUNCTION); \ + break; \ + } + +#define LSTM_FORWARD(ITYPE, DIM) THNN_(LSTMForward) 
\ + \ + <<>> \ + (inputI, hiddenI, \ + bias1I, bias2I, cxI, hyI, cyI, \ + hid_size, totalElements); + +#define LSTM_BACKWARD(ITYPE, DIM) THNN_(LSTMBackward) \ + \ + <<>> \ + (storageI, gradingatesI, cxI, cyI, \ + gradoutI, gradoutcI, gradincxI, \ + hid_size, totalElements); + +#define GRU_FORWARD(ITYPE, DIM) THNN_(GRUForward) \ + <<>> \ + (inputI, hiddenI, bias1I, bias2I, hxI, hyI, storageI, \ + hid_size, totalElements); + +#define GRU_BACKWARD(ITYPE, DIM) THNN_(GRUBackward) \ + \ + <<>> \ + (gradininputI, gradinhiddenI, gradoutI, gradinhxI, storageI, \ + hid_size, totalElements); + +// ************ END Create actual function calls ************ // + +template +void THNN_(LSTM_forw_ind_wrap)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *cx, + THCTensor *hy, + THCTensor *cy) +{ + bool has_bias = (bias1!=NULL); + + int maxDim; + if(has_bias){ + THCUNN_assertSameGPU(state, 7, input, hidden, bias1, bias2, hy, cy, cx); + maxDim = THNN_(minIndexType) + (state, 7, input, hidden, bias1, bias2, hy, cy, cx); + }else{ + THCUNN_assertSameGPU(state, 5, input, hidden, hy, cy, cx); + maxDim = THNN_(minIndexType) + (state, 5, input, hidden, hy, cy, cx); + } + + ptrdiff_t totalElements = THCTensor_nElement(state, cx); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply."); + + TINFO inputI = getTensorInfo(state, input); + TINFO hiddenI = getTensorInfo(state, hidden); + TINFO cxI = getTensorInfo(state, cx); + TINFO hyI = getTensorInfo(state, hy); + TINFO cyI = getTensorInfo(state, cy); + + INDTYPE hid_size = cxI.sizes[cxI.dims-1]; + if(has_bias){ + THAssertMsg( hid_size*4 == static_cast(THCTensor_(nElement)(state, bias1)) && + hid_size*4 == static_cast(THCTensor_(nElement)(state, bias2)), + "Bias in pointwise operation is an incorrect size, must be 4 x feature size."); + } + + if(maxDim == -2){ + inputI.collapseDims(); + hiddenI.collapseDims(); + cxI.collapseDims(); + hyI.collapseDims(); + cyI.collapseDims(); + } + + INDTYPE zero[1] = {0}; + TINFO nullinfo = TINFO(NULL, 1, zero, zero); + TINFO bias1I = nullinfo; + TINFO bias2I = nullinfo; + + if(has_bias){ + bias1I = getTensorInfo(state, bias1); + bias2I = getTensorInfo(state, bias2); + if(maxDim == -2){ + bias1I.collapseDims(); + bias2I.collapseDims(); + } + } + + FILL_DIM(INDTYPE, maxDim, LSTM_FORWARD); + +} +void THNN_(LSTMFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *cx, + THCTensor *hy, + THCTensor *cy) +{ + THCTensor_(resizeAs)(state, hy, cx); + THCTensor_(resizeAs)(state, cy, cx); + THNN_(FusedRNNAssertSizes)(state, 4, 5, input, hidden, hy, cy, cx); + + bool has_bias = (bias1!=NULL); + bool canUse32bi; + if(has_bias){ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, input, hidden, bias1, bias2, hy, cy, cx); + }else{ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 5, input, hidden, hy, cy, cx); + } + + if(canUse32bi){ + THNN_(LSTM_forw_ind_wrap) + (state, input, hidden, bias1, bias2, cx, hy, cy); + }else{ + THNN_(LSTM_forw_ind_wrap) + (state, input, hidden, bias1, bias2, cx, hy, cy); + } + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(LSTM_back_ind_wrap)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *cx, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + 
THCTensor *gradInputCx) +{ + int maxDim = THNN_(minIndexType) + (state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply"); + + TINFO storageI = getTensorInfo(state, storage); + TINFO gradingatesI = getTensorInfo(state, gradInGates); + TINFO cxI = getTensorInfo(state, cx); + TINFO cyI = getTensorInfo(state, cy); + TINFO gradoutI = getTensorInfo(state, gradOutput); + TINFO gradoutcI = getTensorInfo(state, gradOutputCell); + TINFO gradincxI = getTensorInfo(state, gradInputCx); + + INDTYPE hid_size = gradoutI.sizes[gradoutI.dims-1]; + + if(maxDim == -2){ + storageI.collapseDims(); + gradingatesI.collapseDims(); + cxI.collapseDims(); + cyI.collapseDims(); + gradoutI.collapseDims(); + gradoutcI.collapseDims(); + gradincxI.collapseDims(); + } + FILL_DIM(INDTYPE, maxDim, LSTM_BACKWARD); + +} + +void THNN_(LSTMFused_updateGradInput)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *cx, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + THCTensor *gradInputCx) +{ + THCTensor_(resizeAs)(state, gradInputCx, gradOutput); + THCUNN_assertSameGPU(state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + THNN_(FusedRNNAssertSizes) + (state, 4, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + + bool canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + + if(canUse32bi){ + THNN_(LSTM_back_ind_wrap) + (state, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + }else{ + THNN_(LSTM_back_ind_wrap) + (state, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + } + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(GRU_forw_ind_wrap)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *hx, + THCTensor *hy, + THCTensor *storage) +{ + bool has_bias = (bias1!=NULL); + int maxDim; + + if(has_bias){ + THCUNN_assertSameGPU + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + maxDim = THNN_(minIndexType) + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + }else{ + THCUNN_assertSameGPU + (state, 5, input, hidden, hx, hy, storage); + maxDim = THNN_(minIndexType) + (state, 5, input, hidden, hx, hy, storage); + } + + ptrdiff_t totalElements = THCTensor_nElement(state, hx); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply."); + + TINFO inputI = getTensorInfo(state, input); + TINFO hiddenI = getTensorInfo(state, hidden); + TINFO hxI = getTensorInfo(state, hx); + TINFO hyI = getTensorInfo(state, hy); + TINFO storageI = getTensorInfo(state, storage); + + INDTYPE hid_size = hxI.sizes[hxI.dims-1]; + if(has_bias){ + THAssertMsg( hid_size*3 == static_cast(THCTensor_(nElement)(state, bias1)) && + hid_size*3 == static_cast(THCTensor_(nElement)(state, bias2)), + "Bias in pointwise operation is an incorrect size, must be 3 x feature size."); + } + + if(maxDim == -2){ + inputI.collapseDims(); + hiddenI.collapseDims(); + hyI.collapseDims(); + 
hxI.collapseDims(); + storageI.collapseDims(); + } + + INDTYPE zero[1] = {0}; + TINFO nullinfo = TINFO(NULL, 1, zero, zero); + TINFO bias1I = nullinfo; + TINFO bias2I = nullinfo; + + if(has_bias){ + bias1I = getTensorInfo(state, bias1); + bias2I = getTensorInfo(state, bias2); + if(maxDim == -2){ + bias1I.collapseDims(); + bias2I.collapseDims(); + } + } + + FILL_DIM(INDTYPE, maxDim, GRU_FORWARD); + +} + +void THNN_(GRUFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *hx, + THCTensor *hy, + THCTensor *storage) +{ + THCTensor_(resizeAs)(state, hy, hx); + THNN_(FusedRNNAssertSizes)(state, 3, 4, input, hidden, hx, hy); + THArgCheck(THCTensor_(nElement)(state, storage) == + THCTensor_(nElement)(state, hx)*5, + 3, "Storage tensor for fused kernel was not sized correctly."); + + + bool has_bias = (bias1!=NULL); + bool canUse32bi; + + if(has_bias){ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + }else{ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 5, input, hidden, hx, hy, storage); + } + + if(canUse32bi){ + THNN_(GRU_forw_ind_wrap) + (state, input, hidden, bias1, bias2, hx, hy, storage); + }else{ + THNN_(GRU_forw_ind_wrap) + (state, input, hidden, bias1, bias2, hx, hy, storage); + } + + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(GRU_back_ind_wrap)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage) +{ + + int maxDim = THNN_(minIndexType)(state, 5, gradInInput, gradInHidden, gradOutput, + gradInputHx, storage); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply"); + + TINFO gradininputI = getTensorInfo(state, gradInInput); + TINFO gradinhiddenI = getTensorInfo(state, gradInHidden); + TINFO gradoutI = getTensorInfo(state, gradOutput); + TINFO gradinhxI = getTensorInfo(state, gradInputHx); + TINFO storageI = getTensorInfo(state, storage); + + INDTYPE hid_size = gradoutI.sizes[gradoutI.dims-1]; + + if(maxDim == -2){ + gradininputI.collapseDims(); + gradinhiddenI.collapseDims(); + gradoutI.collapseDims(); + gradinhxI.collapseDims(); + storageI.collapseDims(); + } + FILL_DIM(INDTYPE, maxDim, GRU_BACKWARD); +} + +void THNN_(GRUFused_updateGradInput)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage) +{ + THCTensor_(resizeAs)(state, gradInputHx, gradOutput); + THCUNN_assertSameGPU(state, 5, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + THNN_(FusedRNNAssertSizes)(state, 3, 4, gradInInput, gradInHidden, gradOutput, gradInputHx); + bool canUse32bi = THNN_(canUse32BitIndexMath)(state, 5, gradInInput, gradInHidden, + gradOutput, gradInputHx, storage); + if(canUse32bi){ + THNN_(GRU_back_ind_wrap) + (state, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + }else{ + THNN_(GRU_back_ind_wrap) + (state, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + } + + THCudaCheck(cudaGetLastError()); +} + +//Clean up compiler namespace +#undef DEVICE_LINEAR_GET +#undef H2F +#undef F2H +#undef EXPAND_FUNCTION +#undef EXPAND_DIM +#undef EXPAND_TYPE +#undef FILL_TYPES_FORWARD +#undef FILL_FORWARD +#undef 
FILL_TYPES_BACKWARD +#undef FILL_BACKWARD + +#endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu new file mode 100644 index 0000000..4622403 --- /dev/null +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -0,0 +1,59 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/GatedLinearUnit.cu" +#else + +void THNN_(GatedLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int dim) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + // size output to half of input + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THCTensor_(size)(state, input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; + THLongStorage *newSizes = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(newSizes, dim, inputSize); + THCTensor_(resize)(state, output, newSizes, NULL); + + // halve tensor + THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); + THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize); + + // x = x1:cmul( sigmoid(x2) ) + THC_pointwiseApply3(state, output, secondHalf, firstHalf, gatedLinearCSigMul_functor()); + + THLongStorage_free(newSizes); + THCTensor_(free)(state, firstHalf); + THCTensor_(free)(state, secondHalf); +} + +void THNN_(GatedLinear_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int dim) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THCTensor_(size)(state, input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + THCTensor_(resizeAs)(state, gradInput, input); + const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; + THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); + THCTensor *gradInputfirstHalf = THCTensor_(newNarrow)(state, gradInput, dim, 0, inputSize); + const int64_t stride_i = THCTensor_(stride)(state, input, dim) * inputSize; + const int64_t stride_gI = THCTensor_(stride)(state, gradInput, dim) * inputSize; + THC_pointwiseApply3(state, gradInputfirstHalf, gradOutput, firstHalf, gatedLinearDerivative(stride_i, stride_gI)); + THCTensor_(free)(state, firstHalf); + THCTensor_(free)(state, gradInputfirstHalf); +} + +#endif diff --git a/aten/src/THCUNN/generic/HardTanh.cu b/aten/src/THCUNN/generic/HardTanh.cu new file mode 100644 index 0000000..18195b7 --- /dev/null +++ b/aten/src/THCUNN/generic/HardTanh.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/HardTanh.cu" +#else + +#include "../common.h" + +void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = ScalarConvert::to(min_val_); + real max_val = ScalarConvert::to(max_val_); + + THCUNN_assertSameGPU(state, 2, input, output); + if(inplace) + { + THCTensor_(set)(state, output, input); + THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, + hardtanhupdateOutput_functor(min_val, max_val)); + } +} + +void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = ScalarConvert::to(min_val_); + real max_val = ScalarConvert::to(max_val_); + + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + if (inplace) + { + THCTensor_(set)(state, gradInput, gradOutput); + THC_pointwiseApply2(state, gradInput, input, + hardtanhupdateGradInput_functor(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, + hardtanhupdateGradInput_functor(min_val, max_val)); + } +} + +#endif diff --git a/aten/src/THCUNN/generic/Im2Col.cu b/aten/src/THCUNN/generic/Im2Col.cu new file mode 100644 index 0000000..dd35461 --- /dev/null +++ b/aten/src/THCUNN/generic/Im2Col.cu @@ -0,0 +1,119 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Im2Col.cu" +#else + +static inline void THNN_(Im2Col_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 6, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(padW >= 0 && padH >= 0, 8, + "padding should be non-negative, but got padH: %d padW: %d", padH, padW); + THArgCheck(sW > 0 && sH > 0, 10, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + + int64_t ndim = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "Expected non-empty 3D or 4D input tensor, but got input of shape 
%s"); + + int dim_batch = 0; + if (ndim == 3) { + dim_batch = -1; + } + int64_t nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); + int64_t inputHeight = THCTensor_(size)(state, input, dim_batch + 2); + int64_t inputWidth = THCTensor_(size)(state, input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + if (outputHeight < 1 || outputWidth < 1) { + THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), calculated " + "shape of the array of sliding blocks as (%d, %d), which is " + "too small (non-positive).", + inputHeight, inputHeight, kH, kW, dH, dW, padH, padW, + outputHeight, outputWidth); + } +} + +void THNN_(Im2Col_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THCUNN_assertSameGPU(state, 2, input, output); + + THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); + + input = THCTensor_(newContiguous)(state, input); + bool batched_input = true; + if (input->dim() == 3) { + batched_input = false; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t batchSize = THCTensor_(size)(state, input, 0); + int64_t nInputPlane = THCTensor_(size)(state, input, 1); + int64_t inputHeight = THCTensor_(size)(state, input, 2); + int64_t inputWidth = THCTensor_(size)(state, input, 3); + + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + THCTensor_(resize3d)(state, output, batchSize, nOutputPlane, outputLength); + THCTensor_(zero)(state, output); + + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + for (int64_t elt = 0; elt < batchSize; elt++) { + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, sH, sW, + dH, dW, THCTensor_(data)(state, output_n)); + } + + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + if (!batched_input) { + THCTensor_(resize2d)(state, output, nOutputPlane, outputLength); + } + THCTensor_(free)(state, input); +} + +void THNN_(Im2Col_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, + inputHeight, inputWidth, + kH, kW, dH, dW, + padH, padW, sH, sW); +} + +#endif diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu new file mode 100644 index 0000000..244d234 --- /dev/null +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -0,0 +1,273 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/IndexLinear.cu" +#else + +static bool THNN_(checkKeysValues)(THCState *state, THCudaLongTensor* keys, + THCTensor* values) +{ + return THCudaLongTensor_size(state, keys, 0) == 
THCTensor_(nElement)(state, values) + && THCTensor_(_nDimension)(state, values) == 1 + && THCudaLongTensor__nDimension(state, keys) == 1; +} + +void THNN_(IndexLinear_updateOutput)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *normalizedValues, + int train) +{ + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, output), 6, + "output vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 7, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 8, + "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t wDim = weight->size[1]; + int64_t weightStride = weight->stride[0]; + int maxNormalize = wDim - outDim; + int64_t keysSize = keys->size[0]; + int64_t nnzPerRow = divup(keysSize, batchSize); + + THCTensor_(resize2d)(state, output, batchSize, outDim); + int64_t *keysData = THCudaLongTensor_data (state, keys); + real *valuesData = THCTensor_(data) (state, values); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + real *biasData = THCTensor_(data) (state, bias); + real *weightData = THCTensor_(data) (state, weight); + real *outData = THCTensor_(data) (state, output); + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + int blocks_y = batchSize; + int nnzPerBlock = ((outDim == 1 || batchSize == 1) ? 
THREADS_X : NNZ_PER_BLOCK_MAX); + int blocks_z = divup(nnzPerRow, nnzPerBlock); + + dim3 blocks(blocks_x, blocks_y, blocks_z); + + if (blocks_z > 1) { + THCudaCheck(cudaMemsetAsync(outData, 0, outDim * batchSize * sizeof(real), stream)); + } + + real *normalizedValuesData = NULL; + if (maxNormalize && train) { + THCTensor_(resize1d)(state, normalizedValues, keysSize); + normalizedValuesData = THCTensor_(data)(state, normalizedValues); + updateOutput<<>> + (outData, normalizedValuesData, valuesData, cumSumSizesData, keysData, + batchSize, outDim, weightData, biasData, weightStride, keysOffset, maxNormalize, nnzPerBlock); + } else { + updateOutput<<>> + (outData, normalizedValuesData, valuesData, cumSumSizesData, keysData, + batchSize, outDim, weightData, biasData, weightStride, keysOffset, maxNormalize, nnzPerBlock); + } +} + +void THNN_(IndexLinear_accGradParameters)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCTensor* valuesBuffer, + accreal weightDecay, + accreal scale) +{ + int64_t keysSize = keys->size[0]; + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t wDim = weight->size[1]; + int maxNormalize = wDim - outDim; + + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 6, + "gradOutput vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 7, + "gradWeight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 8, + "gradBias vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 9, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 10, + "bias vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, valuesBuffer), 11, + "valuesBuffer vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + THCTensor_(resize2d)(state, gradWeight, keysSize, outDim * (maxNormalize > 0 ? 
2 : 1)); + + real *valuesData = THCTensor_(data) (state, values); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + real *gradOutputData = THCTensor_(data) (state, gradOutput); + real *gradBiasData = THCTensor_(data) (state, gradBias); + real *gradWeightData = THCTensor_(data) (state, gradWeight); + int64_t gradWeightStride = gradWeight->stride[0]; + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + accGradBias<<>> + (gradBiasData, gradOutputData, outDim, batchSize, scale, weightDecay); + + dim3 blocks(blocks_x, batchSize); + accGradWeight<<>> + (gradWeightData, gradOutputData, valuesData, cumSumSizesData, outDim, + gradWeightStride, scale, weightDecay, maxNormalize); +} + +void THNN_(IndexLinear_accUpdateGradParameters)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) +{ + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 6, + "gradOutput vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 7, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 8, + "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t keysSize = keys->size[0]; + int64_t wDim = weight->size[1]; + int maxNormalize = wDim - outDim; + + real *biasData = THCTensor_(data) (state, bias); + real *weightData = THCTensor_(data) (state, weight); + real *gradOutputData = THCTensor_(data) (state, gradOutput); + real *valuesData = THCTensor_(data) (state, values); + int64_t *keysData = THCudaLongTensor_data (state, keys); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + int64_t weightStride = weight->stride[0]; + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + + accGradBias<<>> + (biasData, gradOutputData, outDim, batchSize, scale, weightDecay); + + int64_t nnzPerRow = divup(keysSize, batchSize); + int blocks_y = divup(nnzPerRow, REPEAT * threads.y); + dim3 blocks(blocks_x, blocks_y); + + for (int64_t batchId = 0; batchId < batchSize; batchId++) { + accUpdateWeight<<>> + (weightData, weightStride, gradOutputData, outDim, valuesData, + cumSumSizesData, keysData, keysOffset, scale, weightDecay, maxNormalize, + batchId); + } +} + +void THNN_(IndexLinear_updateParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCudaLongTensor *runningKeys, + THCudaLongTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate) +{ + // Make sure these inputs are contiguous to accelerate computations + 
THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 1,
+             "gradWeight matrix must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, gradBias), 2,
+             "gradBias vector must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, weight), 3,
+             "weight matrix must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, bias), 4,
+             "bias vector must be contiguous");
+  THArgCheck(THCudaLongTensor_isContiguous(state, runningKeys), 5,
+             "runningKeys vector must be contiguous");
+  THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 6,
+             "cumSumSizes vector must be contiguous");
+
+  int64_t outDim = bias->size[0];
+  int64_t wDim = weight->size[1];
+  int maxNormalize = wDim - outDim;
+  int64_t keysSize = runningKeys->size[0];
+  int64_t batchSize = cumSumSizes->size[0];
+
+  THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias);
+  int64_t gradWeightStride = gradWeight->stride[0];
+  int64_t weightStride = weight->stride[0];
+
+  int64_t *keysData = THCudaLongTensor_data (state, runningKeys);
+  int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes);
+  real *gradWeightData = THCTensor_(data) (state, gradWeight);
+  real *weightData = THCTensor_(data) (state, weight);
+
+  dim3 threads(THREADS_X, THREADS_Y);
+  int64_t nnzPerRow = divup(keysSize, batchSize);
+  int blocks_x = divup(outDim, threads.x);
+  int blocks_y = divup(nnzPerRow, REPEAT * threads.y);
+  dim3 blocks(blocks_x, blocks_y);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  for (int64_t batchId = 0; batchId < batchSize; batchId++) {
+    updateWeight<real><<<blocks, threads, 0, stream>>>
+      (weightData, gradWeightData, keysData, cumSumSizesData, outDim,
+       gradWeightStride, weightStride, keysOffset, learningRate, weightDecay,
+       maxNormalize, batchId);
+  }
+}
+#endif
diff --git a/aten/src/THCUNN/generic/L1Cost.cu b/aten/src/THCUNN/generic/L1Cost.cu
new file mode 100644
index 0000000..fd85e61
--- /dev/null
+++ b/aten/src/THCUNN/generic/L1Cost.cu
@@ -0,0 +1,44 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/L1Cost.cu"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{
+  THCUNN_check_dim_size(state, output, 1, 0, 1);
+  THCUNN_assertSameGPU(state, 1, input);
+  accreal sum;
+  ptrdiff_t size = THCTensor_(nElement)(state, input);
+  input = THCTensor_(newContiguous)(state, input);
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());
+
+  THCTensor_(free)(state, input);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(L1Cost_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)
+{
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 2, input, gradInput);
+  ptrdiff_t size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());
+
+  THCTensor_(free)(state, input);
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LeakyReLU.cu b/aten/src/THCUNN/generic/LeakyReLU.cu
new file mode 100644
index 0000000..dc92090
--- /dev/null
+++ b/aten/src/THCUNN/generic/LeakyReLU.cu
@@ -0,0 +1,59 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LeakyReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LeakyReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           accreal negval_,
+           bool inplace)
+{
+  real negval = ScalarConvert<accreal, real>::to(negval_);
+
+  THCUNN_assertSameGPU(state, 2, input, output);
+
+  if (inplace)
+  {
+    THC_pointwiseApply1<real>(state, input, LeakyReLUUpdateOutputIP<real>(negval));
+    THCTensor_(set)(state, output, input);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2<real, real>(state, output, input, LeakyReLUUpdateOutput<real>(negval));
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           accreal negval_,
+           bool inplace)
+{
+  real negval = ScalarConvert<accreal, real>::to(negval_);
+
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
+
+  if (inplace)
+  {
+    THC_pointwiseApply2<real, real>(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
+    THCTensor_(set)(state, gradInput, gradOutput);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THC_pointwiseApply3<real, real, real>(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LogSigmoid.cu b/aten/src/THCUNN/generic/LogSigmoid.cu
new file mode 100644
index 0000000..02d55da
--- /dev/null
+++ b/aten/src/THCUNN/generic/LogSigmoid.cu
@@ -0,0 +1,31 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSigmoid_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *buffer)
+{
+  THCUNN_assertSameGPU(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2<real, real>(state, output, input, logSigmoid_updateOutput_functor<real>());
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *buffer)
+{
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3<real, real, real>(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LookupTable.cu b/aten/src/THCUNN/generic/LookupTable.cu
new file mode 100644
index 0000000..22653dd
--- /dev/null
+++ b/aten/src/THCUNN/generic/LookupTable.cu
@@ -0,0 +1,212 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LookupTable.cu"
+#else
+
+void THNN_(LookupTable_accGradParameters)(
+           THCState *state,
+           THCIndexTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCIndexTensor *count,
+           THCIndexTensor *sortedIndices,
+           THCIndexTensor *origIndices,
+           bool scaleGradByFreq,
+           int paddingValue,
+           accreal scale_)
+{
+  real scale = ScalarConvert<accreal, real>::to(scale_);
+  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sortedIndices, origIndices);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  if (!(THCIndexTensor_(isContiguous)(state, input) &&
+        THCTensor_(isContiguous)(state, gradWeight))) {
+    THError("Tensors must be contiguous");
+  }
+
+  int nDim = THCIndexTensor_(_nDimension)(state, input);
+  if (THCIndexTensor_(_nDimension)(state, input) != 1 && THCIndexTensor_(_nDimension)(state, input) != 2) {
+    THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, input); +
THError("input must be a vector or matrix, but is of shape: %s", s1.str); + } + + ptrdiff_t numel = THCIndexTensor_(nElement)(state, input); + int64_t stride = THCTensor_(stride)(state, gradWeight, 0); + + cudaStream_t stream = THCState_getCurrentStream(state); + + if (numel <= 768 && !scaleGradByFreq) { + const int WARP_SIZE = 32; + const int BLOCKDIMY = 32; + dim3 grid(THCCeilDiv(stride, (int64_t)WARP_SIZE)); + dim3 block(WARP_SIZE, BLOCKDIMY); + + cunn_LookupTable_accGradParametersKernelByFeature + <<>> + (THCIndexTensor_(data)(state, input), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + scale, + numel, + stride, + paddingValue); + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); + return; + } + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THCIndexTensor_(resize)(state, sortedIndices, inputSize, NULL); + THCIndexTensor_(resize)(state, origIndices, inputSize, NULL); + THLongStorage_free(inputSize); + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + THCIndexTensor_(copy)(state, sortedIndices, input); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr + sortedIndicesIter(THCIndexTensor_(data)(state, sortedIndices)); + thrust::device_ptr + origIndicesIter(THCIndexTensor_(data)(state, origIndices)); + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIter(TH_INDEX_BASE); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + numel, origIndicesIter); + + // Sort; a stable sort is not required + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndicesIter, sortedIndicesIter + numel, + origIndicesIter, ThrustLTOp()); + } + + THCIndex_t *sortedIndices_data = THCIndexTensor_(data)(state, sortedIndices); + THCIndex_t *origIndices_data = THCIndexTensor_(data)(state, origIndices); + THCIndex_t *count_data = NULL; + + if (scaleGradByFreq) { + THCIndexTensor_(resizeAs)(state, count, input); + count_data = THCIndexTensor_(data)(state, count); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr sortedIndices_ptr(sortedIndices_data); + thrust::device_ptr count_ptr(count_data); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndices_ptr, + sortedIndices_ptr + numel, + thrust::make_constant_iterator(1), + count_ptr + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + thrust::make_reverse_iterator(sortedIndices_ptr + numel), + thrust::make_reverse_iterator(sortedIndices_ptr), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + cunn_LookupTable_accGradParametersKernel<<>>( + sortedIndices_data, + origIndices_data, + 
THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + count_data, + scale, + numel, + stride, + paddingValue + ); + + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); +} + +#define THREADS 256 +#define RUN(NORM, IDXTYPE) \ + calculate_norms_and_renorm \ + <<>> \ + (weightsRaw, idxRaw, normType, maxNorm, THCTensor_(stride)(state, weight, 0)) + +void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + accreal maxNorm, + accreal normType) +{ + THCUNN_assertSameGPU(state, 2, idx, weight); + if (!(THCIndexTensor_(isContiguous)(state, idx) && + THCTensor_(isContiguous)(state, weight))) { + THError("Tensors must be contiguous"); + } + + if (THCIndexTensor_(_nDimension)(state, idx) != 1) { + THError("idx must be a vector"); + } + + if (normType <= 0) { + THError("non-positive-norm not supported"); + } + + THCIndex_t numel = THCIndexTensor_(nElement)(state, idx); + + real * weightsRaw = THCTensor_(data)(state, weight); + THCIndex_t * idxRaw = THCIndexTensor_(data)(state, idx); + + // get the unique indices + thrust::device_ptr idxThrust(idxRaw); + thrust::device_ptr endIdxThrust(thrust::unique(idxThrust, idxThrust+numel)); + numel = endIdxThrust - idxThrust; + + // At launch time figure out what the index type is and norm type + int Norm = ScalarConvert::to(normType); + if (THCTensor_canUse32BitIndexMath(state, idx)) { + if (Norm == 1) { + RUN(1, unsigned int); + } else if (Norm == 2) { + RUN(2, unsigned int); + } else { + RUN(-1, unsigned int); + } + } else { + if (Norm == 1) { + RUN(1, unsigned long); + } else if (Norm == 2) { + RUN(2, unsigned long); + } else { + RUN(-1, unsigned long); + } + } +} + +#endif diff --git a/aten/src/THCUNN/generic/LookupTableBag.cu b/aten/src/THCUNN/generic/LookupTableBag.cu new file mode 100644 index 0000000..8386f60 --- /dev/null +++ b/aten/src/THCUNN/generic/LookupTableBag.cu @@ -0,0 +1,200 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LookupTableBag.cu" +#else + + +void THNN_(LookupTableBag_updateOutput)( + THCState *state, + THCIndexTensor *input, + THCIndexTensor *offsets, + THCTensor *weight, + THCTensor *output, + THCIndexTensor *offset2bag, + int mode, + THCIndexTensor *bag_size) +{ + THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag); + + if (!(THCIndexTensor_(isContiguous)(state, input) && + THCIndexTensor_(isContiguous)(state, offsets) && + THCTensor_(isContiguous)(state, weight))) { + THError("Tensors must be contiguous"); + } + + ptrdiff_t numIndices = THCIndexTensor_(size)(state, input, 0); + ptrdiff_t numBags = THCIndexTensor_(size)(state, offsets, 0); + ptrdiff_t stride = THCTensor_(size)(state, weight, 1); + int64_t *bag_size_data = NULL; + if (bag_size != NULL) { + bag_size_data = THCIndexTensor_(data)(state, bag_size); + } + + cudaStream_t stream = THCState_getCurrentStream(state); + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THLongStorage *outputSize = THLongStorage_newWithSize(2); + THLongStorage_data(outputSize)[0] = numBags; + THLongStorage_data(outputSize)[1] = stride; + THCTensor_(resize)(state, output, outputSize, NULL); + THCTensor_(zero)(state, output); + THCIndexTensor_(resize)(state, offset2bag, inputSize, NULL); + THLongStorage_free(inputSize); + THLongStorage_free(outputSize); + + dim3 block = dim3(32, 8); + int grid = 1024; + cunn_LookupTableBag_updateOutputKernel<<>>( + THCIndexTensor_(data)(state, input), + THCIndexTensor_(data)(state, offsets), + THCTensor_(data)(state, 
weight), + THCTensor_(data)(state, output), + THCIndexTensor_(data)(state, offset2bag), + numIndices, + numBags, + stride, + mode, + bag_size_data + ); + + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(LookupTableBag_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *offset2bag, + THCIndexTensor *count, + THCIndexTensor *sortedIndices, + THCIndexTensor *origIndices, + bool scaleGradByFreq, + int mode, + THCIndexTensor *bag_size, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, offset2bag, sortedIndices, origIndices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (!(THCIndexTensor_(isContiguous)(state, input) && + THCTensor_(isContiguous)(state, gradWeight) && + THCIndexTensor_(isContiguous)(state, offset2bag))) { + THError("Tensors must be contiguous"); + } + + int64_t *bag_size_data = NULL; + if (bag_size != NULL) { + bag_size_data = THCIndexTensor_(data)(state, bag_size); + } + + int nDim = THCIndexTensor_(_nDimension)(state, input); + if (THCIndexTensor_(_nDimension)(state, input) != 1 && THCIndexTensor_(_nDimension)(state, input) != 2) { + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, input); + THError("input must be a vector or matrix, but is of shape: %s", s1.str); + } + + ptrdiff_t numel = THCIndexTensor_(nElement)(state, input); + int64_t stride = THCTensor_(stride)(state, gradWeight, 0); + + cudaStream_t stream = THCState_getCurrentStream(state); + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THCIndexTensor_(resize)(state, sortedIndices, inputSize, NULL); + THCIndexTensor_(resize)(state, origIndices, inputSize, NULL); + THLongStorage_free(inputSize); + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + THCIndexTensor_(copy)(state, sortedIndices, input); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr + sortedIndicesIter(THCIndexTensor_(data)(state, sortedIndices)); + thrust::device_ptr + origIndicesIter(THCIndexTensor_(data)(state, origIndices)); + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIter(TH_INDEX_BASE); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + numel, origIndicesIter); + + // Sort; a stable sort is not required + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndicesIter, sortedIndicesIter + numel, + origIndicesIter, ThrustLTOp()); + } + + THCIndex_t *sortedIndices_data = THCIndexTensor_(data)(state, sortedIndices); + THCIndex_t *origIndices_data = THCIndexTensor_(data)(state, origIndices); + THCIndex_t *offset2bag_data = THCIndexTensor_(data)(state, offset2bag); + THCIndex_t *count_data = NULL; + + if (scaleGradByFreq) { + THCIndexTensor_(resizeAs)(state, count, input); + count_data = THCIndexTensor_(data)(state, count); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr sortedIndices_ptr(sortedIndices_data); + thrust::device_ptr count_ptr(count_data); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + 
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndices_ptr, + sortedIndices_ptr + numel, + thrust::make_constant_iterator(1), + count_ptr + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + thrust::make_reverse_iterator(sortedIndices_ptr + numel), + thrust::make_reverse_iterator(sortedIndices_ptr), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + cunn_LookupTableBag_accGradParametersKernel<<>>( + sortedIndices_data, + origIndices_data, + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + offset2bag_data, + count_data, + scale, + numel, + stride, + mode, + bag_size_data + ); + + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/MSECriterion.cu b/aten/src/THCUNN/generic/MSECriterion.cu new file mode 100644 index 0000000..e41e741 --- /dev/null +++ b/aten/src/THCUNN/generic/MSECriterion.cu @@ -0,0 +1,126 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MSECriterion.cu" +#else + +void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction != Reduction::None) { + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, (accreal) 0, + thrust::plus(), mse_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); + return; + } + + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3( + state, + input, + target, + output, + mse_updateOutput_functor()); +} + +void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + if (reduction != Reduction::None) { + ptrdiff_t size = THCTensor_(nElement)(state, input); + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + accreal norm = reduction == Reduction::ElementwiseMean ? 
(accreal)(2)/size : (accreal)(2); + norm *= ScalarConvert::to(THCTensor_(get1d)(state, gradOutput, 0)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + mse_updateGradInput_functor(norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + return; + } + + THCUNN_check_shape(state, input, gradOutput); + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradOutput_data(THCTensor_(data)(state, gradOutput)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + mse_updateGradInput_functor(2)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + gradInput_data, gradInput_data+size, gradOutput_data, gradInput_data, + thrust::multiplies()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/MarginCriterion.cu b/aten/src/THCUNN/generic/MarginCriterion.cu new file mode 100644 index 0000000..221f9d9 --- /dev/null +++ b/aten/src/THCUNN/generic/MarginCriterion.cu @@ -0,0 +1,70 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MarginCriterion.cu" +#else + +void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_check_nElement(state, input, target); + THCUNN_check_dim_size(state, output, 1, 0, 1); + THCUNN_assertSameGPU(state, 2, input, target); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), + margin_functor(ScalarConvert::to(margin))); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + + +void THNN_(MarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); 
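For context, MarginCriterion is the elementwise hinge loss max(0, margin - x*y); the margin_updateGradInput_functor applied by the thrust::transform below is assumed to compute its subgradient scaled by norm (1/size when sizeAverage is set). A minimal standalone sketch of that rule, with the struct name chosen purely for illustration:

struct margin_grad_sketch {
  float margin, norm;
  margin_grad_sketch(float m, float n) : margin(m), norm(n) {}
  float operator()(float x, float y) const {
    // subgradient of norm * max(0, margin - x*y) with respect to x
    return (x * y < margin) ? -norm * y : 0.f;
  }
};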
+ + THCUNN_check_nElement(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, gradInput); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + accreal norm = sizeAverage ? 1.f/size : 1; + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + margin_updateGradInput_functor(ScalarConvert::to(margin), norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu new file mode 100644 index 0000000..2b02bf2 --- /dev/null +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -0,0 +1,162 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu" +#else + +// TODO: improve error messages +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + int64_t reduction) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + THCTensor_(resizeAs)(state, istarget, input); + + if(input->dim() == 1) + { + int dim = input->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + THCTensor_(resize1d)(state, output, 1); + + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + 1, dim, + reduction == Reduction::ElementwiseMean + ); + THCudaCheck(cudaGetLastError()); + } + else if(input->dim() == 2) + { + int nframe = input->size[0]; + int dim = input->size[1]; + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + + dim3 blocks(input->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + if (reduction != Reduction::None) + { + THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]); + THCTensor_(resize1d)(state, output, 1); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output_tmp), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + nframe, dim, + reduction == Reduction::ElementwiseMean + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(THCTensor_(sumall)(state, output_tmp))); + THCTensor_(free)(state, output_tmp); + } + else + { + THCTensor_(resize1d)(state, output, input->size[0]); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + nframe, dim, + false + ); + THCudaCheck(cudaGetLastError()); + } + } + else + AT_ERROR("non-empty vector or matrix expected, got size: ", input->sizes()); + + 
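As a reference for what the kernels above are assumed to reduce: the standard multilabel margin loss for one row x of length dim with target indices y sums max(0, 1 - (x[y[j]] - x[i])) over valid targets j and non-target classes i, normalized by dim; an ElementwiseMean reduction then averages the per-row values. A host-side sketch under that assumption (padding convention assumed, helper name illustrative):

#include <algorithm>
#include <vector>

static float multilabel_margin_row_sketch(const float* x, const long* y, int dim) {
  // y is assumed to list valid class indices first, padded with negative values
  std::vector<char> is_target(dim, 0);
  int n_targets = 0;
  for (; n_targets < dim && y[n_targets] >= 0; ++n_targets)
    is_target[y[n_targets]] = 1;
  float loss = 0.f;
  for (int j = 0; j < n_targets; ++j)
    for (int i = 0; i < dim; ++i)
      if (!is_target[i])
        loss += std::max(0.f, 1.f - (x[y[j]] - x[i]));
  return loss / dim;  // per-row value; a mean reduction would average these over rows
}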
THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *istarget, + int64_t reduction) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + if(gradInput->dim() == 1) + { + int dim = gradInput->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size[0] == dim), 3, + "inconsistent isTarget size"); + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel + <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + reduction != Reduction::None); + + } + else if(gradInput->dim() == 2) + { + int nframe = gradInput->size[0]; + int dim = gradInput->size[1]; + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size[0] == nframe) + && (istarget->size[1] == dim), 3, "inconsistent isTarget size"); + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel + <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + gradInput->size[0], gradInput->size[1], + reduction == Reduction::ElementwiseMean, + reduction != Reduction::None); + } + else + AT_ERROR("non-empty vector or matrix expected, got size: ", gradInput->sizes()); + + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu new file mode 100644 index 0000000..a620c0f --- /dev/null +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -0,0 +1,236 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiMarginCriterion.cu" +#else + +// TODO: improve error messages +void THNN_(MultiMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + int p, + THCTensor *weights, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_assertSameGPU(state, 2, input, target); + input = THCTensor_(newContiguous)(state, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + if (input->dim() == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + THCTensor_(resize1d)(state, output, 1); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output), + 
THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + reduction == Reduction::ElementwiseMean, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + reduction == Reduction::ElementwiseMean, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->dim() == 2) + { + int nframe = input->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + dim3 blocks(input->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (reduction == Reduction::None) + { + THCTensor_(resize1d)(state, output, input->size[0]); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + false, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + false, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else + { + THCTensor_(resize1d)(state, output, 1); + THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + reduction == Reduction::ElementwiseMean, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL, + input->size[0], input->size[1], + reduction == Reduction::ElementwiseMean, + margin + ); + } + THCudaCheck(cudaGetLastError()); + float sum = THCTensor_(sumall)(state, output_); + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); + THCTensor_(free)(state, output_); + } + } + else + { + AT_ERROR("non-empty vector or matrix expected, got sizes: ", input->sizes()); + } + + THCTensor_(free)(state, input); + if(weights) + THCTensor_(free)(state, weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + int p, + THCTensor *weights, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_assertSameGPU(state, 3, input, gradInput, target); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + + if (input->dim() == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->dim() == 2) + { + int nframe = gradInput->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, gradInput->size[1], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL, + nframe, gradInput->size[1], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + THCudaCheck(cudaGetLastError()); + } + else + { + AT_ERROR("non-empty vector or matrix expected, got ", input->sizes()); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + if(weights) + THCTensor_(free)(state, weights); +} + +#endif diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu new file mode 100644 index 0000000..e03d573 --- /dev/null +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -0,0 +1,164 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/PReLU.cu" +#else + +void THNN_(PReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight) +{ + THCTensor_(resizeAs)(state, output, input); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + + weight = THCTensor_(newContiguous)(state, weight); + real *w = THCTensor_(data)(state, weight); + + if (nOutputPlane == 1) + { + THC_pointwiseApply2(state, output, input, PReLUUpdateOutput(w)); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + input = THCTensor_(newContiguous)(state, input); + + int n = THCTensor_(nElement)(state, input); + if (input->size[ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + + int mapSize = 1; + for (int d = 2; d < ndim; d++) { + mapSize *= input->size[d]; + } + int nElemsPerSample = nOutputPlane * mapSize; + preluForward<<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + w, + n, nElemsPerSample, mapSize + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + } + + THCTensor_(free)(state, weight); +} + +void THNN_(PReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + + weight = THCTensor_(newContiguous)(state, weight); + real *w = THCTensor_(data)(state, weight); + if (nOutputPlane == 1) + { + THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput(w)); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int n = THCTensor_(nElement)(state, input); + if (input->size[ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + + int mapSize = 1; + for (int d = 2; d < ndim; d++) { + mapSize *= input->size[d]; + } + int nElemsPerSample = nOutputPlane * mapSize; + preluBackward<<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + w, + THCTensor_(data)(state, gradOutput), + n, nElemsPerSample, mapSize + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + } + THCTensor_(free)(state, weight); +} + +void THNN_(PReLU_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradWeight, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_check_nElement(state, input, gradOutput); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + // use grad input for temporary storage, then call updateGradInput again + + if (nOutputPlane == 1) + { + THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared()); + + // introduces a sync point + real sum = ScalarConvert::to(THCTensor_(sumall)(state, gradInput)); + real w = THCTensor_(get1d)(state, gradWeight, 0); + THCTensor_(set1d)(state, gradWeight, 0, w + sum * scale); + + // restore gradInput + THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + + if (ndim == 1) + { + THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1(scale)); + } + else + { + THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters(scale)); + THCTensor *gradWeightBuf = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, gradWeightBuf, gradWeight); + + if (ndim == 2) + { + THCTensor_(sum)(state, gradWeightBuf, gradInput, 0, 1); + THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); + } + else + { + THCTensor *sumbuf = THCTensor_(new)(state); + THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput); + int64_t size3 = 1; + for (int d = 2; d < ndim; d++) { + size3 *= input->size[d]; + } + THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, size3); + THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane); + THCTensor_(sum)(state, sumbuf, buffer, 2, 1); + THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0, 1); + THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); + THCTensor_(free)(state, buffer); + THCTensor_(free)(state, sumbuf); + } + + THCTensor_(free)(state, gradWeightBuf); + + // restore gradInput + THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight); + } + } +} + +#endif diff --git a/aten/src/THCUNN/generic/RReLU.cu b/aten/src/THCUNN/generic/RReLU.cu new file mode 100644 index 0000000..bea7f10 --- /dev/null +++ b/aten/src/THCUNN/generic/RReLU.cu @@ -0,0 +1,109 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/RReLU.cu" +#else + +#include "../common.h" + +void THNN_(RReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace, + void *generator) +{ + THCUNN_assertSameGPU(state, 3, input, output, noise); + struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state); + + if (train) + { + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, noise, input); + real *input_data = THCTensor_(data)(state, input); + real *noise_data = 
THCTensor_(data)(state, noise); + ptrdiff_t n = THCTensor_(nElement)(state, input); + if (inplace) + { + rreluUpdateOutputTrain<<>>( + n, gen_states, input_data, noise_data, input_data, lower, upper); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + real *output_data = THCTensor_(data)(state, output); + rreluUpdateOutputTrain<<>>( + n, gen_states, input_data, noise_data, output_data, lower, upper); + } + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + } + else + { + const real negSlope = ScalarConvert::to((lower + upper) / 2); + if (inplace) + { + THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor(negSlope)); + } + } +} + +void THNN_(RReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THCTensor_(cmul)(state, gradOutput, gradOutput, noise); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(cmul)(state, gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = ScalarConvert::to((lower + upper) / 2); + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope)); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope)); + } + } + + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Sigmoid.cu b/aten/src/THCUNN/generic/Sigmoid.cu new file mode 100644 index 0000000..a91a5dd --- /dev/null +++ b/aten/src/THCUNN/generic/Sigmoid.cu @@ -0,0 +1,28 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Sigmoid.cu" +#else + +#include "../common.h" + +void THNN_(Sigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(sigmoid)(state, output, input); +} + +void THNN_(Sigmoid_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_nElement(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoid_updateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SmoothL1Criterion.cu b/aten/src/THCUNN/generic/SmoothL1Criterion.cu new file mode 100644 index 0000000..1760b08 --- /dev/null +++ b/aten/src/THCUNN/generic/SmoothL1Criterion.cu @@ -0,0 +1,103 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu" +#else + +void THNN_(SmoothL1Criterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor 
*target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + THArgCheck( + THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + smoothl1_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, (accreal) 0, + thrust::plus(), smoothl1_functor() + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(SmoothL1Criterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + THArgCheck( + THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + smoothl1_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to(reduction == Reduction::ElementwiseMean ? 
accreal(1)/size : accreal(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + smoothl1_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0)) + ); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftMarginCriterion.cu b/aten/src/THCUNN/generic/SoftMarginCriterion.cu new file mode 100644 index 0000000..47a4368 --- /dev/null +++ b/aten/src/THCUNN/generic/SoftMarginCriterion.cu @@ -0,0 +1,81 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + softmargin_no_reduce_functor()); + return; + } + + accreal sum; + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + THCTensor_(resize1d)(state, output, 1); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), softmargin_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + softmargin_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + ptrdiff_t size = THCTensor_(nElement)(state, input); + accreal norm = (reduction == Reduction::ElementwiseMean ? 
1./size : 1.); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + softmargin_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftPlus.cu b/aten/src/THCUNN/generic/SoftPlus.cu new file mode 100644 index 0000000..5154d8d --- /dev/null +++ b/aten/src/THCUNN/generic/SoftPlus.cu @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftPlus.cu" +#else + +#include "../common.h" + +void THNN_(SoftPlus_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = ScalarConvert::to(beta_); + real threshold = ScalarConvert::to(threshold_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor(threshold, beta)); +} + +void THNN_(SoftPlus_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = ScalarConvert::to(beta_); + real threshold = ScalarConvert::to(threshold_); + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor(threshold, beta)); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftShrink.cu b/aten/src/THCUNN/generic/SoftShrink.cu new file mode 100644 index 0000000..0743f70 --- /dev/null +++ b/aten/src/THCUNN/generic/SoftShrink.cu @@ -0,0 +1,35 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftShrink.cu" +#else + +#include "../common.h" + +void THNN_(SoftShrink_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal lambda_) +{ + real lambda = ScalarConvert::to(lambda_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput(lambda)); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SoftShrink_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal lambda_) +{ + real lambda = ScalarConvert::to(lambda_); + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput(lambda)); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu new file mode 100644 index 0000000..d5270d6 --- /dev/null +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -0,0 +1,274 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SparseLinear.cu" +#else + +static bool THNN_(checkInput)(THCTensor* t) +{ + return !t->is_empty() && t->_dim() == 2 && t->size[1] == 3; +} + +static bool THNN_(checkSize2D)(THCTensor* t, 
int64_t size0, int64_t size1) +{ + return !t->is_empty() && t->_dim() == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) +{ + return !t->is_empty() && t->_dim() == 1 && t->size[0] == size0; +} + +static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { + #ifdef THC_REAL_IS_FLOAT + THCudaIntTensor_copyCudaFloat(state, buf, t); + #elif defined(THC_REAL_IS_DOUBLE) + THCudaIntTensor_copyCudaDouble(state, buf, t); + #elif defined(THC_REAL_IS_HALF) + THCudaIntTensor_copyCudaHalf(state, buf, t); + #endif +} + +void THNN_(SparseLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias) +{ + THAssert(THCTensor_(checkGPU)(state, 4, input, output, weight, bias)); + + int64_t h; + int64_t outDim = THCTensor_(size)(state, weight, 0); + int64_t inDim = THCTensor_(size)(state, weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 3"); + AT_CHECK(!output->is_empty() && THCTensor_(nDimension)(state, output) == 2, + "output must be batchsize x outputsize, got size: ", output->sizes()); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + weight = THCTensor_(newContiguous)(state, weight); + + int64_t batchnum = THCTensor_(size)(state, output, 0); + int64_t nnz = THCTensor_(size)(state, input, 0); + + THCTensor *buffer = THCTensor_(new)(state); + THCTensor *sel = THCTensor_(new)(state); + THCTensor *values = THCTensor_(new)(state); + THCudaIntTensor *rowbuf = THCudaIntTensor_new(state); + THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state); + THCudaIntTensor *colInds = THCudaIntTensor_new(state); + + THCTensor_(resize1d)(state, values, nnz); + THCudaIntTensor_resize1d(state, rowbuf, nnz); + THCudaIntTensor_resize1d(state, colInds, nnz); + THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1); + + // Get data ready for cusparse, need CudaInt buffers + // We do not need to sort, since rows are already in order + // If rows might get out of order in future implementations, or if cusparse + // complains with an illegal memory access, sort like we do in AccGradParameters + THCTensor_(select)(state, sel, input, 1, 0); + THNN_(copyCudaFloatingType)(state, rowbuf, sel); + THCTensor_(select)(state, sel, input, 1, 1); + THNN_(copyCudaFloatingType)(state, colInds, sel); + THCTensor_(select)(state, sel, input, 1, 2); + THCTensor_(copyCuda)(state, values, sel); + + init_cusparse(); + cusparseXcoo2csr(cusparse_handle, + THCudaIntTensor_data(state, rowbuf), nnz, batchnum, + THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE); + + // output = bias + THCTensor_(resize2d)(state, buffer, outDim, batchnum); + THCTensor_(zero)(state, buffer); + for (h=0; h::to(1); + cusparseMatDescr_t descr = 0; + cusparseCreateMatDescr(&descr); + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); + #ifdef THC_REAL_IS_FLOAT + cusparseScsrmm(cusparse_handle, + #elif defined(THC_REAL_IS_DOUBLE) + cusparseDcsrmm(cusparse_handle, + #endif + CUSPARSE_OPERATION_NON_TRANSPOSE, + batchnum, outDim, inDim, nnz, + &one, + descr, + THCTensor_(data)(state, values), + THCudaIntTensor_data(state, csrPtrs), + THCudaIntTensor_data(state, colInds), + THCTensor_(data)(state, weight), inDim, + &one, THCTensor_(data)(state, buffer), batchnum + ); + THCTensor_(transpose)(state, buffer, NULL, 0, 1); + + // We do work in the buffer to keep the output contiguous + 
THCTensor_(copy)(state, output, buffer); + + cusparseDestroyMatDescr(descr); + descr = 0; + THCTensor_(free)(state, buffer); + THCTensor_(free)(state, sel); + THCTensor_(free)(state, values); + THCTensor_(free)(state, weight); + THCudaIntTensor_free(state, rowbuf); + THCudaIntTensor_free(state, colInds); + THCudaIntTensor_free(state, csrPtrs); +} + +void THNN_(SparseLinear_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) +{ + int64_t outDim = THCTensor_(size)(state, weight, 0); + int64_t inDim = THCTensor_(size)(state, weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + + weight = THCTensor_(newContiguous)(state, weight); + int64_t nnz = THCTensor_(size)(state, input, 0); + int64_t batchnum = THCTensor_(size)(state, gradOutput, 0); + + THCTensor *buf = THCTensor_(new)(state); + THCTensor *cols = THCTensor_(new)(state); + THCTensor *sel = THCTensor_(new)(state); + THCudaLongTensor *inds = THCudaLongTensor_new(state); + THCTensor *values = THCTensor_(new)(state); + THCudaIntTensor *colbuf = THCudaIntTensor_new(state); + THCudaIntTensor *colPtrs = THCudaIntTensor_new(state); + THCudaIntTensor *rowInds = THCudaIntTensor_new(state); + + THCTensor_(select)(state, sel, input, 1, 0); // rowInds + THCTensor_(select)(state, cols, input, 1, 1); // colInds + THCTensor_(cadd)(state, buf, sel, batchnum, cols); // colInds * buatchdim + rowInds + THCTensor_(sort)(state, buf, inds, buf, 0, 0); // Indices are now in ind + THCTensor_(indexSelect)(state, buf, input, 0, inds); + + THCTensor_(resize1d)(state, values, nnz); + THCudaIntTensor_resize1d(state, colbuf, nnz); + THCudaIntTensor_resize1d(state, rowInds, nnz); + THCudaIntTensor_resize1d(state, colPtrs, inDim+1); + + // Get data ready for cusparse, need CudaInt buffers + THCTensor_(select)(state, sel, buf, 1, 0); + THNN_(copyCudaFloatingType)(state, rowInds, sel); + THCTensor_(select)(state, sel, buf, 1, 1); + THNN_(copyCudaFloatingType)(state, colbuf, sel); + THCTensor_(select)(state, sel, buf, 1, 2); + THCTensor_(copyCuda)(state, values, sel); + + init_cusparse(); + // Secretly coo2csc + cusparseXcoo2csr(cusparse_handle, + THCudaIntTensor_data(state, colbuf), nnz, inDim, + THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE); + + // FORTRAN expects contiguous col-major matricies + THCTensor *tgradOutput = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutput, gradOutput, 0, 1); + THCTensor_(resize2d)(state, buf, batchnum, outDim); + THCTensor_(copy)(state, buf, tgradOutput); + THCTensor_(free)(state, tgradOutput); + + real one = ScalarConvert::to(1); + cusparseMatDescr_t descr = 0; + cusparseCreateMatDescr(&descr); + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); + #ifdef THC_REAL_IS_FLOAT + cusparseScsrmm(cusparse_handle, + #elif defined(THC_REAL_IS_DOUBLE) + cusparseDcsrmm(cusparse_handle, + #endif + CUSPARSE_OPERATION_NON_TRANSPOSE, + inDim, outDim, batchnum, nnz, + &one, + descr, + THCTensor_(data)(state, values), + THCudaIntTensor_data(state, colPtrs), + THCudaIntTensor_data(state, rowInds), + THCTensor_(data)(state, buf), batchnum, + &one, THCTensor_(data)(state, gradWeight), inDim + ); 
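To clarify the compressed-pointer layout both SparseLinear passes rely on: cusparseXcoo2csr turns a sorted list of one-based row indices into m+1 row pointers, and because the code above sorts by column and feeds column indices instead, the same call "secretly" yields CSC column pointers. A host-side reference of that conversion, written only to illustrate the layout:

// Build CSR-style pointers (one-based, matching CUSPARSE_INDEX_BASE_ONE) from
// nnz sorted COO row indices; ptr gets m+1 entries with ptr[m] == nnz + 1.
static void coo2csr_host_sketch(const int* rowInd, int nnz, int m, int* ptr) {
  for (int i = 0; i <= m; ++i) ptr[i] = 0;
  for (int k = 0; k < nnz; ++k) ptr[rowInd[k]] += 1;  // histogram of rows 1..m
  ptr[0] = 1;                                         // one-based start
  for (int i = 1; i <= m; ++i) ptr[i] += ptr[i - 1];  // prefix sum -> row pointers
}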
+ + THCTensor_(sum)(state, buf, gradOutput, 0, 1); + THCTensor_(resize1d)(state, buf, outDim); + THCTensor_(cadd)(state, gradBias, gradBias, scale, buf); + + if (weightDecay != 0) + { + THCTensor_(cadd)(state, gradWeight, gradWeight, weightDecay, weight); + THCTensor_(cadd)(state, gradBias, gradBias, weightDecay, bias); + } + + THCTensor_(free)(state, weight); + THCTensor_(free)(state, buf); + THCTensor_(free)(state, sel); + THCTensor_(free)(state, cols); + THCudaLongTensor_free(state, inds); + THCTensor_(free)(state, values); + THCudaIntTensor_free(state, colbuf); + THCudaIntTensor_free(state, rowInds); + THCudaIntTensor_free(state, colPtrs); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias) { + THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); +} +void THNN_(SparseLinear_legacyAccGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) { + THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); +} + +// Dense updates are pretty fast on the GPU +void THNN_(SparseLinear_zeroGradParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput) { + THCTensor_(zero)(state, gradWeight); + THCTensor_(zero)(state, gradBias); +} + +void THNN_(SparseLinear_updateParameters)( + THCState *state, + THCTensor *weight, + THCTensor *bias, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput, + accreal learningRate) { + THCTensor_(cadd)(state, weight, weight, -learningRate, gradWeight); + THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu new file mode 100644 index 0000000..05a7b04 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu @@ -0,0 +1,173 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.cu" +#else + +#include "../common.h" + +// 4d tensor B x D x H x W + +void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + real *output_data; + real *input_data; + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t istrideD = input->stride[0]; + int64_t istrideH = input->stride[1]; + int64_t istrideW = input->stride[2]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, sizeD, osizeH, osizeW); + + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + // run averagepool kernel + adaptiveaveragepool <<>> (input_data, output_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + + } else { + input = THCTensor_(newContiguous)(state, input); + int64_t sizeB = input->size[0]; + int64_t sizeD = 
input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t istrideD = input->stride[1]; + int64_t istrideH = input->stride[2]; + int64_t istrideW = input->stride[3]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, sizeB, sizeD, osizeH, osizeW); + + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeB * sizeD, blocksH); + dim3 threads(32, 8); + + // run averagepool kernel + adaptiveaveragepool <<>> (input_data, output_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests + + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + real *gradInput_data; + real *gradOutput_data; + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t osizeH = gradOutput->size[1]; + int64_t osizeW = gradOutput->size[2]; + + //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel + adaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t osizeH = gradOutput->size[2]; + int64_t osizeW = gradOutput->size[3]; + + //bool atomic = //(isizeW%osizeW != 0) || (isizeH%osizeH != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeB * sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel, accumulate gradients atomically + adaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state,gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu new file mode 100644 index 0000000..3e5fab6 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu @@ -0,0 +1,193 @@ +#ifndef THC_GENERIC_FILE 
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu" +#else + +#include "../common.h" + +// 4d tensor B x D x H x W + +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + + THCIndex_t *indices_data; + real *output_data; + real *input_data; + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t istrideD = input->stride[0]; + int64_t istrideH = input->stride[1]; + int64_t istrideW = input->stride[2]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, sizeD, osizeH, osizeW); + THCIndexTensor_(resize3d)(state, indices, sizeD, osizeH, osizeW); + + indices_data = THCIndexTensor_(data)(state, indices); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 1 : blocksH; + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + // run maxpool kernel + adaptivemaxpool <<>> (input_data, output_data, + indices_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + + } else { + input = THCTensor_(newContiguous)(state, input); + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t istrideD = input->stride[1]; + int64_t istrideH = input->stride[2]; + int64_t istrideW = input->stride[3]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, sizeB, sizeD, osizeH, osizeW); + THCIndexTensor_(resize4d)(state, indices, sizeB, sizeD, osizeH, osizeW); + + indices_data = THCIndexTensor_(data)(state, indices); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 
1 : blocksH; + dim3 blocks(sizeB*sizeD, blocksH); + dim3 threads(32, 8); + + // run maxpool kernel + adaptivemaxpool <<>> (input_data, output_data, + indices_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices) +{ + bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests + + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + THCIndex_t *indices_data; + real *gradInput_data; + real *gradOutput_data; + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t osizeH = gradOutput->size[1]; + int64_t osizeW = gradOutput->size[2]; + + //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + indices_data = THCIndexTensor_(data)(state, indices); + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 1 : blocksH; + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t osizeH = gradOutput->size[2]; + int64_t osizeW = gradOutput->size[3]; + + //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + indices_data = THCIndexTensor_(data)(state, indices); + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 
1 : blocksH; + dim3 blocks(sizeB*sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel, accumulate gradients atomically + adaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state,gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAveragePooling.cu new file mode 100644 index 0000000..7b3d2d4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAveragePooling.cu @@ -0,0 +1,237 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialAveragePooling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialAveragePooling_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + int kH, int kW, int dH, int dW, int padH, int padW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t nInputRows = input->size[dimh]; + int64_t nInputCols = input->size[dimw]; + int64_t nOutputRows, nOutputCols; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); + } +} + +void THNN_(SpatialAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THNN_(SpatialAveragePooling_shapeCheck) + (state, input, NULL, kH, kW, dH, dW, + padH, padW, ceil_mode); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + if(count_include_pad) + AvePoolForward + <<>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + else + AvePoolForward + <<>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THNN_(SpatialAveragePooling_shapeCheck) + (state, input, gradOutput, kH, kW, dH, dW, + padH, padW, ceil_mode); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + int dimCol = 2; + int dimRow = 1; + + if (input->dim() == 3) { + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + dimCol = 3; + dimRow = 2; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + nInputCols = input->size[dimCol]; + nInputRows = input->size[dimRow]; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + 
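The output-size rule used by the shape check and by both the forward and backward paths in this file can be summarized in one small helper; this is a sketch that mirrors the surrounding arithmetic, with the function name chosen for illustration:

#include <math.h>

// floor or ceil of (inSize - k + 2*pad)/stride, plus one; when padding is used,
// drop the last window if it would start entirely inside the padding.
static long pooled_size_sketch(long inSize, int k, int stride, int pad, int ceil_mode) {
  float span = (float)(inSize - k + 2 * pad);
  long out = (ceil_mode ? (long)ceilf(span / stride) : (long)floorf(span / stride)) + 1;
  if (pad && (out - 1) * stride >= inSize + pad)
    --out;  // ensure the last pooling window starts inside the image
  return out;
}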
nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimRow, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimCol, nOutputCols); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + if(count_include_pad) + AvePoolBackward + <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, + THCTensor_(data)(state, gradInput)); + else + AvePoolBackward + <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu new file mode 100644 index 0000000..693a26d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -0,0 +1,233 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialClassNLLCriterion.cu" +#else + +void THNN_(SpatialClassNLLCriterion_shapeCheck)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *weights) +{ + AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimension)(state, target) == 3, 1, + "only batches of spatial targets supported (non-empty 3D tensors)" \ + " but got targets of size: : ", target->sizes()); + AT_CHECK(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4, 2, + "only batches of spatial inputs supported (non-empty 4D tensors), " \ + "but got input of size: ", input->sizes()); + if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || + THCTensor_(size)(state, input, 2) != THCIndexTensor_(size)(state, target, 1) || + THCTensor_(size)(state, input, 3) != THCIndexTensor_(size)(state, target, 2)) { + THCDescBuff input_size = THCTensor_(sizeDesc)(state, input); + THCDescBuff target_size = THCIndexTensor_(sizeDesc)(state, target); + THError("input and target batch or spatial sizes don't match: target %s, input %s", + target_size.str, input_size.str); + } + + if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) { + THError("weight tensor should be defined either for all or no classes"); + } +} + +static void THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( + THCState *state, + THCTensor *gradOutput, + THCIndexTensor *target) +{ + AT_CHECK(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 3, 2, + "Expected non-empty dimension 3 but got gradOutput of size: ", gradOutput->sizes()); + if (THCTensor_(size)(state, gradOutput, 0) != THCIndexTensor_(size)(state, target, 0) || + THCTensor_(size)(state, gradOutput, 1) 
!= THCIndexTensor_(size)(state, target, 1) || + THCTensor_(size)(state, gradOutput, 2) != THCIndexTensor_(size)(state, target, 2)) { + THCDescBuff gradOutput_size = THCTensor_(sizeDesc)(state, gradOutput); + THCDescBuff target_size = THCIndexTensor_(sizeDesc)(state, target); + THError("gradOutput sizes don't match target sizes: target %s, gradOutput %s", + target_size.str, gradOutput_size.str); + } +} + +void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) +{ + THNN_(SpatialClassNLLCriterion_shapeCheck)(state, input, target, weights); + THCTensor_(resize1d)(state, output, 1); + THCTensor_(resize1d)(state, total_weight, 1); + ignore_index -= TH_INDEX_BASE; + + if (weights) + THCUNN_assertSameGPU(state, 5, input, target, weights, output, total_weight); + else + THCUNN_assertSameGPU(state, 4, input, target, output, total_weight); + + if (reduction == Reduction::None) { + int64_t batch_size = THCTensor_(size)(state, input, 0); + int64_t H = THCTensor_(size)(state, input, 2); + int64_t W = THCTensor_(size)(state, input, 3); + + THCTensor_(resize3d)(state, output, batch_size, H, W); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + int64_t count = batch_size * H * W; + SpatialClassNLLCriterion_updateOutput_no_reduce_kernel + <<>>( + count, + toDeviceTensor(state, input), + toDeviceTensor(state, target), + toDeviceTensor(state, output), + weights ? THCTensor_(data)(state, weights) : NULL, + ignore_index); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0); + THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; + int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 
1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + + THCTensor_(fill)(state, output, ScalarConvert::to(0)); + THCTensor_(fill)(state, total_weight, ScalarConvert::to(0)); + + cunn_SpatialClassNLLCriterion_updateOutput_kernel + <<>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3), + blocks_per_sample, + ignore_index + ); + THCudaCheck(cudaGetLastError()); + if (reduction == Reduction::ElementwiseMean) { + cunn_SpatialClassNLLCriterion_sizeAverage_kernel<<<1, 1, 0, THCState_getCurrentStream(state)>>>( + output_data, total_weight_data + ); + THCudaCheck(cudaGetLastError()); + } + + if (weights) + THCTensor_(free)(state, weights); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) +{ + THNN_(SpatialClassNLLCriterion_shapeCheck)(state, input, target, weights); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, + "gradInput must be contiguous"); + ignore_index -= TH_INDEX_BASE; + + if (weights) + THCUNN_assertSameGPU(state, 5, weights, input, target, gradInput, total_weight); + else + THCUNN_assertSameGPU(state, 4, input, target, gradInput, total_weight); + + if (reduction == Reduction::None) { + THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( + state, + gradOutput, + target); + + int64_t batch_size = THCTensor_(size)(state, input, 0); + int64_t H = THCTensor_(size)(state, input, 2); + int64_t W = THCTensor_(size)(state, input, 3); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + int64_t count = batch_size * H * W; + SpatialClassNLLCriterion_updateGradInput_no_reduce_kernel + <<>>( + count, + toDeviceTensor(state, target), + toDeviceTensor(state, gradOutput), + toDeviceTensor(state, gradInput), + weights ? THCTensor_(data)(state, weights) : NULL, + ignore_index); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + real *gradInput_data = THCTensor_(data)(state, gradInput); + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0); + THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; + int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 
1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + + cunn_SpatialClassNLLCriterion_updateGradInput_kernel + <<>>( + gradInput_data, + gradOutput_data, + target_data, + weights_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2) *THCTensor_(size)(state, input, 3), + blocks_per_sample, + ignore_index + ); + THCudaCheck(cudaGetLastError()); + + if (weights) + THCTensor_(free)(state, weights); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu new file mode 100644 index 0000000..6446394 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -0,0 +1,411 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionLocal.cu" +#else + +static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, + int dW, int padH, int padW, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t nInputPlane = weight->size[2] / (kH * kW); + int64_t nOutputPlane = weight->size[1]; + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 3, 0, nOutputPlane); + THCUNN_check_dim_size(state, bias, 3, 1, outputHeight); + THCUNN_check_dim_size(state, bias, 3, 2, outputWidth); + } + + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +static THCTensor* THNN_(view_weight_local)( + THCState *state, + THCTensor *_weight) +{ + THCTensor *weight = THCTensor_(newContiguous)(state, _weight); + AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, + "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); + if (weight->dim() == 6) { + int64_t s1 = weight->size[0] * weight->size[1]; + int64_t s2 = weight->size[2]; + int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + THCTensor *old_weight = weight; + weight = THCTensor_(newWithStorage3d)(state, + weight->storage, + weight->storageOffset, + s1, -1, s2, -1, s3, -1); + THCTensor_(free)(state, old_weight); + } + return weight; +} + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + THCUNN_assertSameGPU(state, 5, input, output, weight, + bias, finput); + + weight = 
THNN_(view_weight_local)(state, weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + + int64_t nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Augment the input + THCTensor_(resize3d)(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *finput3d, *output3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *oslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + + finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + THCTensor_(copy)(state, output_n, bias); + + // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THCTensor_(baddbmm)(state, output3d, ScalarConvert::to(1), + output3d, ScalarConvert::to(1), + weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THCTensor_(free)(state, output3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, islice); + THCTensor_(free)(state, oslice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + fgradInput, gradInput); + + weight = THNN_(view_weight_local)(state, weight); + + 
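+  // The locally-connected weight has just been viewed as a 3D tensor of shape
+  // (outputHeight*outputWidth) x nOutputPlane x (nInputPlane*kH*kW), i.e. one
+  // independent filter per output location. Below, gradInput is computed per
+  // sample by a batched matrix multiply of the transposed per-location weights
+  // with the matching gradOutput columns, then col2im scatters those column
+  // gradients back into the input layout.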
THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize3d)(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *fgradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 1, 2); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *fgradInput3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *gislice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THCTensor_(baddbmm)(state, fgradInput3d, + ScalarConvert::to(0), + fgradInput3d, ScalarConvert::to(1), + tweight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, fgradInput_n), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, fgradInput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, gislice); + THCTensor_(free)(state, goslice); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, fgradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, tweight); + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void 
THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, + gradBias, finput); + + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + gradWeight = THNN_(view_weight_local)(state, gradWeight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputPlane = THCTensor_(size)(state,gradWeight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,gradWeight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *finput3d; + THCTensor *gwslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THCTensor_(baddbmm)(state, gradWeight, ScalarConvert::to(1), + gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutput_n); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, gwslice); + THCTensor_(free)(state, goslice); + THCTensor_(free)(state, islice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, 
nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradWeight); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu new file mode 100644 index 0000000..b5dab9b --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -0,0 +1,527 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionMM.cu" +#else + +static inline void THNN_(SpatialConvolutionMM_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t exactInputHeight = inputHeight + 2 * padH; + int64_t exactInputWidth = inputWidth + 2 * padW; + + if (exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld). " + "Kernel size: (%ld x %ld). Kernel size can't be greater than actual input size", + exactInputHeight, exactInputWidth, kH, kW); + } + + int64_t outputHeight = (exactInputHeight - kH) / dH + 1; + int64_t outputWidth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kH * kW); + } + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialConvolutionMM_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + + int freeWeight = 0; + + // Params: + int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; + int nOutputPlane = weight->size[0]; + + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0); + + input = THCTensor_(newContiguous)(state, input); + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
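+    // The bias GEMM in the batch loop forms the outer product of bias (length
+    // nOutputPlane) with this ones vector (length outputHeight*outputWidth),
+    // filling every spatial position of each output plane with its bias value
+    // in a single call.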
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + if (freeWeight) + THCTensor_(free)(state, weight); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); +} + +void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); + + // Params + int nInputPlane = weight->dim() == 2 ? 
weight->size[1]/(kW*kH) : weight->size[1]; + int nOutputPlane = weight->size[0]; + + int freeWeight = 0; + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, weight); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); + if (gradWeight) { + 
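+    // gradWeight is accumulated into directly through its raw data pointer by
+    // the GEMM in the batch loop below (beta = 1), so it has to be contiguous.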
THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1); + + // Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + + int freeWeight = 0; + if (gradWeight && gradWeight->dim() == 4) { + int64_t s1 = gradWeight->size[0]; + int64_t s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; + gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes 
column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, gradWeight); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu new file mode 100644 index 0000000..fbdd8b4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -0,0 +1,122 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" +#else + +void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) +{ + real alpha = ScalarConvert::to(alpha_); + real beta = ScalarConvert::to(beta_); + real k = ScalarConvert::to(k_); + + THCTensor_(resizeAs)(state, output, input); + THCTensor_(resizeAs)(state, scale, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->dim() == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + + int n_threads = batchSize * imsize_h * imsize_w; + LRNFillScale <<>>( + n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size, + alpha / local_size, k, THCTensor_(data)(state, scale)); + n_threads *= nInputPlane; + THCudaCheck(cudaGetLastError()); + LRNComputeOutput<<>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); +} + + +void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, + int local_size, accreal alpha_, accreal beta_, accreal k_) +{ + real alpha = ScalarConvert::to(alpha_); + real beta = ScalarConvert::to(beta_); + real k = ScalarConvert::to(k_); + (void) k; + THCTensor_(resizeAs)(state, gradInput, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->dim() == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int 
n_threads = batchSize * imsize_h * imsize_w; + LRNComputeDiff <<>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output), + THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w, + local_size, -beta, ScalarConvert::to(2) * alpha * beta / local_size, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + accreal alpha, + accreal beta, + accreal k) +{ + THNN_(LRNforward)(state, input, output, scale, size, alpha, beta, k); +} + +void THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + accreal alpha, + accreal beta, + accreal k) +{ + THNN_(LRNbackward)(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu new file mode 100644 index 0000000..61cd0e2 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -0,0 +1,254 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDepthwiseConvolution.cu" +#else + +void THNN_(SpatialDepthwiseConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, input, output, weight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!weight->is_empty() && THCTensor_(nDimension)(state, weight) == 4); + + // We assume that the input and weight Tensors are shaped properly by + // the caller, so we verify that here to some extent + + // Weight Tensor is shape (output_channels, 1, kH, kW) + THAssert(weight->size[1] == 1); + + // Input Tensor is shape (N, input_channels, H, W) + // We verify that the # of output_channels is a multiple of input_channels + THAssert(weight->size[0] % input->size[1] == 0); + + // Bias has same # of channels as output + if (bias) { + THAssert(bias->size[0] == weight->size[0]); + } + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? 
THCTensor_(newContiguous)(state, bias) : bias; + + // Following the behvaior of other THCUNN functions, we shape the output + // Tensor ourselves + + int batchSize = input->size[0]; + int height = input->size[2]; + int width = input->size[3]; + int outputHeight = (height + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int outputWidth = (width + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int outputChannels = weight->size[0]; + + THCTensor_(resize4d)(state, output, batchSize, outputChannels, outputHeight, outputWidth); + + // Create THCDeviceTensor + // Kernel currently relies upon all the Tensors to be contiguous, but we made + // them contiguous above + THCDeviceTensor dInput = toDeviceTensor(state, input); + THCDeviceTensor dWeight = toDeviceTensor(state, weight); + THCDeviceTensor dOutput = toDeviceTensor(state, output); + THCDeviceTensor dBias; + if (bias) { + dBias = toDeviceTensor(state, bias); + } + + int inputChannels = input->size[1]; + int depthwiseMultiplier = outputChannels / inputChannels; + + // One thread per output value + int n = THCTensor_(nElement)(state, output); + int blocks = GET_BLOCKS(n); + dim3 grid(blocks); + dim3 block(CUDA_NUM_THREADS); + if (kW == 3 && kH == 3) { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (kW == 1 && kH == 1) { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(SpatialDepthwiseConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, gradOutput, gradInput, weight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!weight->is_empty() && THCTensor_(nDimension)(state, weight) == 4); + THAssert(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 4); + + // Minimal shape checking, as above + // Same # of elements in batch + THAssert(input->size[0] == gradOutput->size[0]); + // Same # of filters as outputChannels + THAssert(weight->size[0] == gradOutput->size[1]); + + // Resize GradInput + THCTensor_(resizeAs)(state, gradInput, input); + + int inputChannels = input->size[1]; + int height = input->size[2]; + int width = input->size[3]; + + int outputChannels = gradOutput->size[1]; + int outputHeight = gradOutput->size[2]; + int outputWidth = gradOutput->size[3]; + + int depthwiseMultiplier = outputChannels / inputChannels; + + THCDeviceTensor dGradOutput = toDeviceTensor(state, gradOutput); + THCDeviceTensor dGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor dWeight = toDeviceTensor(state, weight); + + // Kernel currently relies upon all 
the Tensors to be contiguous + THAssert(dGradOutput.isContiguous()); + THAssert(dGradInput.isContiguous()); + THAssert(dWeight.isContiguous()); + + // One thread per gradInput value + int n = THCTensor_(nElement)(state, gradInput); + int blocks = GET_BLOCKS(n); + dim3 grid(blocks); + dim3 block(CUDA_NUM_THREADS); + if (kW == 3 && kH == 3) + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + else if (kW == 1 && kH == 1) + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + else + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + + + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialDepthwiseConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradWeight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 4); + THAssert(!gradWeight->is_empty() && THCTensor_(nDimension)(state, gradWeight) == 4); + + // Minimal shape checking as above + // Same # of elements in batch + THAssert(input->size[0] == gradOutput->size[0]); + // Same # of filters as outputChannels + 
THAssert(gradWeight->size[0] == gradOutput->size[1]); + + int batchSize = input->size[0]; + int inputChannels = input->size[1]; + int height = input->size[2]; + int width = input->size[3]; + + int outputChannels = gradOutput->size[1]; + int outputHeight = gradOutput->size[2]; + int outputWidth = gradOutput->size[3]; + + int depthwiseMultiplier = outputChannels / inputChannels; + + THCDeviceTensor dGradOutput = toDeviceTensor(state, gradOutput); + THCDeviceTensor dInput = toDeviceTensor(state, input); + THCDeviceTensor dGradWeight = toDeviceTensor(state, gradWeight); + + // Kernel currently relies upon all the Tensors to be contiguous + THAssert(dGradOutput.isContiguous()); + THAssert(dInput.isContiguous()); + THAssert(dGradWeight.isContiguous()); + + // We parallelize so that each block computes a single value in gradWeight + int blocks = outputChannels * kH * kW; + + + // Make sure we have enough threads to perform the reduction, and use this number + // to create the shared memory size for the reduction + dim3 grid(blocks); + dim3 block(getGradParamsNumThreads(batchSize)); + int smem = block.x * sizeof(accreal); + + spatialDepthwiseConvolutionAccGradParameters<<>>( + dGradOutput, dInput, dGradWeight, batchSize, inputChannels, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu new file mode 100644 index 0000000..1cac7f6 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -0,0 +1,497 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedConvolution.cu" +#else + +static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 4, 4, weight, + "non-empty 4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + THArgCheck(THCTensor_(isContiguous)(state, bias), 5, "bias tensor has to be contiguous"); + } + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? THCTensor_(newContiguous)(state, bias) : bias; + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
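+    // As in SpatialConvolutionMM above, the ones buffer lets the bias be
+    // broadcast over all outputHeight*outputWidth positions with one GEMM;
+    // the dilation parameters only change how im2col gathers the input
+    // columns further below.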
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], 
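Both GEMM calls above carry the note about cuBLAS assuming column-major data. The pattern is the usual one: a row-major m x n matrix is exactly the column-major view of its transpose, so C = A*B in row-major terms is obtained by asking the column-major routine for C^T = B^T * A^T, i.e. passing (n, m, k) and the operands in swapped order with leading dimensions n and k. A tiny self-contained check of that identity (plain C++, not cuBLAS, not part of the patch):

#include <cstdio>

// naive column-major gemm: C(MxN) = A(MxK) * B(KxN), no transposes
void gemm_colmajor(int M, int N, int K, const float* A, int lda,
                   const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      float acc = 0.f;
      for (int p = 0; p < K; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  // row-major A (2x3) and B (3x2); expected row-major C = A*B is {58 64; 139 154}
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[6] = {7, 8, 9, 10, 11, 12};
  float C[4] = {0, 0, 0, 0};
  const int m = 2, n = 2, k = 3;
  // "row-major gemm" via the column-major routine: pass (n, m, k), B first.
  gemm_colmajor(n, m, k, B, n, A, k, C, n);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
  return 0;
}

The forward call above instantiates this with m = nOutputPlane, n = outputHeight*outputWidth (columns->size[1]), k = nInputPlane*kH*kW, A = weight and B = columns, with beta = 1 so the result accumulates on top of the bias already written into output_n.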
gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias) { + THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); + } + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 1); + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + // Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int 
is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, 
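The bias path above uses a GEMV against the all-ones buffer, which is simply a GPU-friendly way of summing gradOutput over the spatial positions of each output plane. A CPU equivalent for one batch element, not part of the patch:

#include <cstdint>

// What the gemv against `ones` computes: gradOutput_n is laid out as
// [nOutputPlane][outputHeight*outputWidth], and each plane's sum is
// accumulated into gradBias, scaled by `scale`.
void accGradBias(const float* gradOutput_n, float* gradBias,
                 int64_t nOutputPlane, int64_t outputHW, float scale) {
  for (int64_t o = 0; o < nOutputPlane; ++o) {
    float s = 0.f;
    for (int64_t p = 0; p < outputHW; ++p)
      s += gradOutput_n[o * outputHW + p];
    gradBias[o] += scale * s;
  }
}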
outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu new file mode 100644 index 0000000..7425345 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu @@ -0,0 +1,246 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, THCIndexTensor *indices, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationH > 0 && dilationW > 0, 12, + "dilation should be greater than zero, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + int batchSize = 1; + + if (ndim == 4) { + batchSize = input->size[0]; + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t nInputRows = input->size[dimh]; + int64_t nInputCols = input->size[dimw]; + int64_t nOutputRows, nOutputCols; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
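The output-extent computation that appears repeatedly in this pooling file (shape check, updateOutput and updateGradInput) is the standard pooling formula with an effective kernel of dilation*(k-1)+1, ceil or floor rounding depending on ceil_mode, and a correction so that the last window still starts inside the padded image. A standalone per-dimension sketch, not part of the patch:

#include <cmath>
#include <cstdint>

int64_t pooledOutSize(int64_t inSize, int kernel, int stride, int pad,
                      int dilation, bool ceil_mode) {
  const double span =
      double(inSize - (dilation * (kernel - 1) + 1) + 2 * pad) / stride;
  int64_t out =
      (ceil_mode ? (int64_t)std::ceil(span) : (int64_t)std::floor(span)) + 1;
  if (pad) {
    // ensure that the last pooling window starts inside the image,
    // which ceil_mode can otherwise violate
    if ((out - 1) * stride >= inSize + pad) --out;
  }
  return out;
}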
Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, 4, 0, batchSize); + THCUNN_check_dim_size_indices(state, indices, 4, 1, nOutputPlane); + THCUNN_check_dim_size_indices(state, indices, 4, 2, nOutputRows); + THCUNN_check_dim_size_indices(state, indices, 4, 3, nOutputCols); + } +} + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + + THCUNN_assertSameGPU(state, 3, input, output, indices); + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (state, input, NULL, NULL, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + THCUNN_resizeAs_indices(state, indices, output); + + THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices); + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); +} + +void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (state, input, gradOutput, indices, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + input = 
THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->_dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + dim3 grid; + int imgcount = nInputCols * nInputRows; + const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; + grid.x = blocks; + grid.y = batchSize; + grid.z = nInputPlane; + uint64_t maxGridY = THCState_getCurrentDeviceProperties(state)->maxGridSize[1]; + uint64_t maxGridZ = THCState_getCurrentDeviceProperties(state)->maxGridSize[2]; + if (maxGridY < grid.y) grid.y = maxGridY; + if (maxGridZ < grid.z) grid.z = maxGridZ; + MaxPoolBackward <<< grid, BACKWARD_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, gradOutput); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu new file mode 100644 index 0000000..0535653 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu @@ -0,0 +1,157 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFractionalMaxPooling.cu" +#else + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples) +{ + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int64_t numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + /* sizes */ + int64_t numPlanes = THCTensor_(size)(state, input, planeDim); + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH + poolSizeH - 1 <= 
inputH, 6, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 <= inputW, 5, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + THCDeviceTensor devIndices; + THCDeviceTensor devSamples = + toDeviceTensor(state, randomSamples); + + if (numInputDims == 3) { + /* resize output */ + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize3d)(state, indices, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize4d)(state, indices, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devInput.getSize(1), + devInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); + +#define SFMP_UPDATE_OUTPUT(POOL_W) \ + SpatialFractionalMaxPooling_updateOutput \ + <<>>( \ + devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH); + +#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ + case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break + + switch (poolSizeW) { + SFMP_UPDATE_OUTPUT_CASE(2); + SFMP_UPDATE_OUTPUT_CASE(3); + SFMP_UPDATE_OUTPUT_CASE(4); + SFMP_UPDATE_OUTPUT_CASE(5); + SFMP_UPDATE_OUTPUT_CASE(6); + SFMP_UPDATE_OUTPUT_CASE(7); + default: + // dynamic pool width + SFMP_UPDATE_OUTPUT_CASE(-1); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices) +{ + int dimh = 1; + int dimw = 2; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + dimh++; + dimw++; + } + + /* sizes */ + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected"); + + /* resize */ + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + THCDeviceTensor devIndices; + + /* backprop */ + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devGradOutput.getSize(2) * 
devGradOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devGradInput.getSize(1), + devGradInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); + + SpatialFractionalMaxPooling_updateGradInput + <<>>( + devGradInput, devGradOutput, devIndices); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialFullConvolution.cu b/aten/src/THCUNN/generic/SpatialFullConvolution.cu new file mode 100644 index 0000000..6f9fa98 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFullConvolution.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu" +#else + +void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + +void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, gradColumns, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + + +void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale_) +{ + THNN_(SpatialFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, + columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_); +} + +#endif \ No newline at end of file diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu new file mode 100644 index 0000000..58ab364 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -0,0 +1,498 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFullDilatedConvolution.cu" +#else + +static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, + int adjH, int adjW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + THArgCheck((adjW < dW || adjW < dilationW) && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller than either stride or dilation, but got adjH: %d adjW: %d dH: %d dW: %d dilationH: %d dilationW: %d", + adjH, adjW, dH, dW, dilationH, dilationW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D 
weight tensor expected, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output spatial size per channel: (%ld x %ld). Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[0]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[1]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 6, input, output, weight, + bias, columns, ones); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); + + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, columns), + nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n_ + ); + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + + +void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, adjH, adjW, 1); + + int nOutputPlane; + if (gradWeight != NULL) { + nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + } else if (gradBias != NULL) { + nOutputPlane = THCTensor_(size)(state, gradBias, 0); + } else { + return; + } + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THCTensor_(isContiguous)(state, columns), 6, "columns needs to be 
contiguous"); + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + 
THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, input->size[1], inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu new file mode 100644 index 0000000..0e9afdf --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu @@ -0,0 +1,97 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" +#else + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimension)(state, input) == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimension)(state, grid) == 4, 2, grid, + "4D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t iheight = THCTensor_(size)(state, input, 2); + int64_t iwidth = THCTensor_(size)(state, input, 3); + int64_t oheight = THCTensor_(size)(state, grid, 1); + int64_t owidth = THCTensor_(size)(state, grid, 2); + + THCUNN_check_dim_size(state, grid, 4, 0, nbatch); + THCUNN_check_dim_size(state, grid, 4, 3, 2); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); + } +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state,grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + // resize output to the same shape as input + THCTensor_(resize4d)(state, output, N, C, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = 
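The sampler resizes `output` to [N, C, H, W] with H and W taken from the grid, so each grid cell carries one (x, y) pair of normalized coordinates per output location. A scalar CPU sketch of a single bilinear lookup, not part of the patch, assuming the common convention that coordinates lie in [-1, 1] and map to pixel centers via (g + 1) / 2 * (size - 1); the padding_mode handling done by the CUDA kernel is ignored and out-of-range reads are simply clamped here:

#include <cmath>
#include <cstddef>
#include <vector>

float bilinearSample(const std::vector<float>& img, int H, int W,
                     float gx, float gy) {
  const float ix = (gx + 1.f) / 2.f * (W - 1);   // unnormalize x to [0, W-1]
  const float iy = (gy + 1.f) / 2.f * (H - 1);   // unnormalize y to [0, H-1]
  const int x0 = static_cast<int>(std::floor(ix));
  const int y0 = static_cast<int>(std::floor(iy));
  const float tx = ix - x0, ty = iy - y0;
  auto at = [&](int y, int x) {
    x = x < 0 ? 0 : (x > W - 1 ? W - 1 : x);
    y = y < 0 ? 0 : (y > H - 1 ? H - 1 : y);
    return img[static_cast<std::size_t>(y) * W + x];
  };
  // weighted average of the four surrounding pixels
  return (1 - ty) * ((1 - tx) * at(y0, x0) + tx * at(y0, x0 + 1)) +
         ty * ((1 - tx) * at(y0 + 1, x0) + tx * at(y0 + 1, x0 + 1));
}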
THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state, grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); + THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxPooling.cu b/aten/src/THCUNN/generic/SpatialMaxPooling.cu new file mode 100644 index 0000000..6be838d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialMaxPooling.cu @@ -0,0 +1,40 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu" +#else + +#include "../common.h" + +void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu new file mode 100644 index 0000000..90d6e0a --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu" +#else + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + THCUNN_check_shape_indices(state, indices, input); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth); + THCTensor_(zero)(state, output); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, 
THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices); +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); + THCUNN_check_shape_indices(state, indices, input); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int dimw = 2; + int dimh = 1; + + if (input->dim() == 3) { + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + ++dimw; + ++dimh; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + nInputCols = input->size[dimw]; + nInputRows = input->size[dimh]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", + oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + // clean + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu new file mode 100644 index 0000000..0c90944 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -0,0 +1,137 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu" +#else + +void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(padL < inputW && padR < inputW, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padL, padR, dimw, THCTensor_(sizeDesc)(state, input).str); + + THArgCheck(padT < inputH && 
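Max unpooling is a pure scatter: the output plane is zero-filled and every pooled value is written back to the position recorded in `indices` by the matching max-pooling pass (the backward pass gathers from those same positions). A per-plane CPU sketch, not part of the patch, assuming the indices are 0-based offsets into each oheight x owidth plane:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

void maxUnpoolPlane(const std::vector<float>& input,      // pooled values
                    const std::vector<int64_t>& indices,  // same shape as input
                    std::vector<float>& output,           // oheight * owidth
                    int64_t oheight, int64_t owidth) {
  std::fill(output.begin(), output.end(), 0.f);
  for (std::size_t i = 0; i < input.size(); ++i) {
    const int64_t target = indices[i];
    if (target >= 0 && target < oheight * owidth)
      output[static_cast<std::size_t>(target)] = input[i];
  }
}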
padB < inputH, 6, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padT, padB, dimh, THCTensor_(sizeDesc)(state, input).str); + + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 || outputH >= 1, 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + inputH, inputW, outputH, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReflectionPadding_updateOutput<<>>( + devInput, devOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int oheight = iheight + padT + padB; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); + + SpatialReflectionPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu new file mode 100644 index 0000000..6ab694d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -0,0 +1,127 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu" +#else + +void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 || outputH >= 1 , 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + inputH, inputW, outputH, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateOutput<<>>( + devInput, devOutput, padT, padB, padL, padR); + +} + +void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int oheight = iheight + padT + padB; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. 
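The reflection and replication kernels themselves are defined outside these generic files; for orientation only, one common way to express the two per-dimension source-index mappings is sketched below, not part of the patch. Reflection mirrors about the edge without repeating it (which is why the reflection path checks pad < input size above), while replication clamps to the edge:

int reflectIndex(int out, int pad, int inSize) {
  int i = out - pad;                           // position relative to the unpadded input
  if (i < 0) i = -i;                           // mirror across the left/top edge
  if (i >= inSize) i = 2 * (inSize - 1) - i;   // mirror across the right/bottom edge
  return i;
}

int replicateIndex(int out, int pad, int inSize) {
  int i = out - pad;
  if (i < 0) i = 0;                            // clamp to the first row/column
  if (i > inSize - 1) i = inSize - 1;          // clamp to the last row/column
  return i;
}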
Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialSubSampling.cu b/aten/src/THCUNN/generic/SpatialSubSampling.cu new file mode 100644 index 0000000..ea71c82 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialSubSampling.cu @@ -0,0 +1,259 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialSubSampling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + int kW, int kH) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + int dimc = 2; + int dimr = 1; + int dimp = 0; + + if (input->dim() == 4) { + dimc++; + dimr++; + dimp++; + } + + int64_t nInputCols = input->size[dimc]; + int64_t nInputRows = input->size[dimr]; + THArgCheck(input->size[dimp] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); +} + +void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH) +{ + real *weight_data = THCTensor_(data)(state, weight); + real *bias_data = THCTensor_(data)(state, bias); + real *output_data; + real *input_data; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + THCUNN_assertSameGPU(state, 4, input, output, weight, bias); + THNN_(SpatialSubSampling_shapeCheck)(state, input, NULL, weight, kW, kH); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + int64_t nOutputCols = (nInputCols - kW) / dW + 1; + int64_t nOutputRows = (nInputRows - kH) / dH + 1; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample <<>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + int64_t nOutputCols = (nInputCols - kW) / dW + 1; + int64_t nOutputRows = (nInputRows - kH) / dH + 1; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample <<>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); + THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, weight, kW, kH); + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + + real *weight_data = THCTensor_(data)(state, weight); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + + real *weight_data = THCTensor_(data)(state, weight); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias); + THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, gradWeight, kW, kH); + + int nInputPlane = THCTensor_(size)(state, gradWeight, 0); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *input_data; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + // cuda blocks & threads: + dim3 blocks(nInputPlane); + dim3 threads(32,8); + + // run gradweight kernel + subgradweight <<>> ( + input_data, gradOutput_data, gradWeight_data, gradBias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *input_data; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + // cuda blocks & threads: + dim3 blocks(nInputPlane); + dim3 threads(32,8); + + // run gradweight kernel + int64_t sl; + for (sl=0; sl <<>> ( + input_data + sl*input->stride[0], + gradOutput_data + sl*gradOutput->stride[0], + gradWeight_data, gradBias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu new file mode 100644 index 0000000..f9cc0a4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu @@ -0,0 +1,105 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if 
(input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); + } +} + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputHeight = THCTensor_(size)(state, input, 2); + int inputWidth = THCTensor_(size)(state, input, 3); + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize4d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputHeight, outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rheight, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputHeight, inputWidth, + outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rheight, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu new file mode 100644 index 0000000..a71fc5b --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu @@ -0,0 +1,101 @@ 
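Both upsampling files here (bilinear above, nearest-neighbor below) use the same launch recipe: one CUDA thread per output pixel (num_kernels = outputHeight * outputWidth), grouped into THCCeilDiv(num_kernels, maxThreadsPerBlock) blocks on the current stream. The bilinear variant additionally derives a height/width rescaling factor through linear_upsampling_compute_scale, whose definition lives in ../linear_upsampling.h and is not shown in this diff; the standalone sketch below therefore assumes the usual align_corners convention and is illustrative only, not code from this patch.

// upsampling_launch_sketch.cpp -- standalone illustration (assumed helper
// semantics; not part of this patch). Builds with any C++11 compiler.
#include <cstdio>

// Assumed behaviour of linear_upsampling_compute_scale(): with align_corners
// the corner pixels of input and output coincide, so the step is
// (in - 1) / (out - 1); otherwise it is the plain size ratio in / out.
static double compute_scale(int inputSize, int outputSize, bool align_corners) {
  if (outputSize <= 1) return 0.0;
  return align_corners ? static_cast<double>(inputSize - 1) / (outputSize - 1)
                       : static_cast<double>(inputSize) / outputSize;
}

// Equivalent of THCCeilDiv: round a/b up to the next whole block.
static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int inputHeight = 4, outputHeight = 8, outputWidth = 12;
  const int maxThreadsPerBlock = 1024;  // stands in for maxThreadsPerBlock from the device properties

  // One thread per output pixel, as in the updateOutput/updateGradInput bodies above.
  const int num_kernels = outputHeight * outputWidth;
  std::printf("rheight (align_corners=true)  = %f\n", compute_scale(inputHeight, outputHeight, true));
  std::printf("rheight (align_corners=false) = %f\n", compute_scale(inputHeight, outputHeight, false));
  std::printf("launch: %d block(s) x %d threads for %d output pixels\n",
              ceil_div(num_kernels, maxThreadsPerBlock), maxThreadsPerBlock, num_kernels);
  return 0;
}

The same ceil-division also shapes the padding kernels earlier in this diff, which use THCCeilDiv(outputPlaneSize, 256) blocks in x with at most 256 threads per block, and plane/batch indices in the y and z grid dimensions.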
+#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 4, 2, input, + "4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); + } +} + + +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputHeight = THCTensor_(size)(state, input, 2); + int inputWidth = THCTensor_(size)(state, input, 3); + + THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + + THCTensor_(resize4d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputHeight, + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_4d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + + +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, + inputHeight, inputWidth, outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + + const int num_kernels = outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + + nearest_neighbor_4d_kernel_backward <<>>(num_kernels, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Sqrt.cu b/aten/src/THCUNN/generic/Sqrt.cu new file mode 100644 index 0000000..57a6fc8 --- /dev/null +++ b/aten/src/THCUNN/generic/Sqrt.cu @@ 
-0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Sqrt.cu" +#else + +#include "../common.h" + +void THNN_(Sqrt_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal eps_) +{ + real eps = ScalarConvert::to(eps_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor(eps)); +} + +void THNN_(Sqrt_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_shape(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/Square.cu b/aten/src/THCUNN/generic/Square.cu new file mode 100644 index 0000000..745502b --- /dev/null +++ b/aten/src/THCUNN/generic/Square.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Square.cu" +#else + +#include "../common.h" + +void THNN_(Square_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, squareupdateOutput_functor()); +} + +void THNN_(Square_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_check_shape(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h new file mode 100644 index 0000000..eaadf66 --- /dev/null +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -0,0 +1,1694 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCUNN.h" +#else + +#include "Reduction.h" + +THC_API void THNN_(Abs_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Abs_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(AbsCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(AbsCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(BatchNormalization_updateOutput)( + THCState *state, + THCTensor *input_, + THCTensor *output_, + THCTensor *weight_, // [OPTIONAL] + THCTensor *bias_, // [OPTIONAL] + THCTensor *runningMean_, // [OPTIONAL] if train + THCTensor *runningVar_, // [OPTIONAL] if train + THCTensor *saveMean_, + THCTensor *saveStd_, + bool train, + double momentum, + double eps); + +THC_API void THNN_(BatchNormalization_backward)( + THCState *state, + THCTensor *input_, + THCTensor *gradOutput_, + THCTensor *gradInput_, // [OPTIONAL] + THCTensor *gradWeight_, // [OPTIONAL] + THCTensor *gradBias_, // [OPTIONAL] + THCTensor *weight_, // [OPTIONAL] + THCTensor *runningMean_, // [OPTIONAL] if train + THCTensor *runningVar_, // [OPTIONAL] if train + THCTensor *saveMean_, // [OPTIONAL] if !train + THCTensor *saveStd_, // [OPTIONAL] if !train + 
bool train, + double scale, + double eps); + +THC_API void THNN_(BCECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights); // [OPTIONAL] + +THC_API void THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights); // [OPTIONAL] + +THC_API void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal alpha, + accreal scale, + bool inplace); + +THC_API void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal alpha, + accreal scale); + +THC_API void THNN_(FeatureLPPooling_updateOutput)( + THCState* state, + THCTensor* inputTH, + THCTensor* outputTH, + accreal power, + int width, + int stride, + bool batchMode); + +THC_API void THNN_(FeatureLPPooling_updateGradInput)( + THCState* state, + THCTensor* gradOutputTH, + THCTensor* inputTH, + THCTensor* outputTH, + THCTensor* gradInputTH, + accreal power, + int width, + int stride, + bool batchMode); + +THC_API void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal min_val, + accreal max_val, + bool inplace); + +THC_API void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal min_val, + accreal max_val, + bool inplace); + +THC_API void THNN_(GatedLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int dim); + +THC_API void THNN_(GatedLinear_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int dim); + +THC_API void THNN_(Im2Col_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(Im2Col_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(Col2Im_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + + THC_API void THNN_(Col2Im_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + 
int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(LeakyReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal negval, + bool inplace); + +THC_API void THNN_(LeakyReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal negval, + bool inplace); + +THC_API void THNN_(GRUFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, // [OPTIONAL] + THCTensor *bias2, // [OPTIONAL] + THCTensor *hx, + THCTensor *hy, + THCTensor *storage); + +THC_API void THNN_(GRUFused_updateGradInput)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage); + +THC_API void THNN_(LSTMFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, // [OPTIONAL] + THCTensor *bias2, // [OPTIONAL] + THCTensor *cx, + THCTensor *hy, + THCTensor *cy); + +THC_API void THNN_(LSTMFused_updateGradInput)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *prevC, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + THCTensor *gradInputCx); + +THC_API void THNN_(LogSigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *buffer); + +THC_API void THNN_(LogSigmoid_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *buffer); + +THC_API void THNN_(LookupTable_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *count, + THCIndexTensor *sorted, // [OPTIONAL] + THCIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + accreal scale); + +THC_API void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + accreal maxNorm, + accreal normType); + +THC_API void THNN_(LookupTableBag_updateOutput)( + THCState *state, + THCIndexTensor *input, + THCIndexTensor *offsets, + THCTensor *weight, + THCTensor *output, + THCIndexTensor *offset2bag, + int mode, + THCIndexTensor *seq_length); // [OPTIONAL] + +THC_API void THNN_(LookupTableBag_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *offset2bag, + THCIndexTensor *count, + THCIndexTensor *sortedIndices, + THCIndexTensor *origIndices, + bool scaleGradByFreq, + int mode, + THCIndexTensor *seq_length, // [OPTIONAL] + accreal scale_); + +THC_API void THNN_(L1Cost_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(L1Cost_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, // [OPTIONAL] + THCTensor *gradInput); + +THC_API void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + accreal margin); + +THC_API void THNN_(MarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + accreal margin); + +THC_API void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor 
*target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + int64_t reduction); + +THC_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *istarget, + int64_t reduction); + +THC_API void THNN_(MultiMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + int p, + THCTensor *weights, // [OPTIONAL] + accreal margin); + +THC_API void THNN_(MultiMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + int p, + THCTensor *weights, // [OPTIONAL] + accreal margin); + +THC_API void THNN_(PReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight); + +THC_API void THNN_(PReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight); + +THC_API void THNN_(PReLU_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradWeight, + accreal scale); + +THC_API void THNN_(SmoothL1Criterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(SmoothL1Criterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(SparseLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias); + +THC_API void THNN_(SparseLinear_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(SparseLinear_legacyUpdateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias); + +THC_API void THNN_(SparseLinear_legacyAccGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(SparseLinear_zeroGradParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput); + +THC_API void THNN_(SparseLinear_updateParameters)( + THCState *state, + THCTensor *weight, + THCTensor *bias, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput, + accreal learningRate); + +THC_API void THNN_(IndexLinear_updateOutput)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *normalizedValues, + int train); + +THC_API void THNN_(IndexLinear_accGradParameters)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + 
THCTensor *weight, + THCTensor *bias, + THCTensor* valuesBuffer, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(IndexLinear_accUpdateGradParameters)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(IndexLinear_updateParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCIndexTensor *runningKeys, + THCIndexTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate); + +THC_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeW, + int osizeH); + +THC_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices); + +THC_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeW, + int osizeH); + +THC_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(SpatialAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(SpatialAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); + +THC_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); + +THC_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale); + +THC_API void 
THNN_(SpatialConvolutionMM_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +THC_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +THC_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +THC_API void THNN_(SpatialDepthwiseConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + accreal alpha, + accreal beta, + accreal k); + +THC_API void THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + accreal alpha, + accreal beta, + accreal k); + +THC_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale); + +THC_API void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int 
kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale); + +THC_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +THC_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +THC_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples); + +THC_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices); + +THC_API void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale); + +THC_API void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +THC_API void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +THC_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight); + +THC_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight); + +THC_API void THNN_(SpatialReflectionPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor 
*gradInput, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH); + +THC_API void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH); + +THC_API void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale); + +THC_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(RReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace, + void *generator); + +THC_API void THNN_(RReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace); + +THC_API void THNN_(Sigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Sigmoid_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(SoftMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(SoftMarginCriterion_updateGradInput)( + 
THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(SoftPlus_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal beta, + accreal threshold); + +THC_API void THNN_(SoftPlus_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal beta, + accreal threshold); + +THC_API void THNN_(SoftShrink_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal lambda); + +THC_API void THNN_(SoftShrink_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal lambda); + +THC_API void THNN_(Square_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Square_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(Sqrt_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal eps); + +THC_API void THNN_(Sqrt_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(Tanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Tanh_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); + +THC_API void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW); + +THC_API void THNN_(TemporalConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int dW, + accreal scale); + +THC_API void THNN_(TemporalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int dW); + +THC_API void THNN_(TemporalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int dW); + +THC_API void THNN_(TemporalRowConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); + +THC_API void THNN_(TemporalRowConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); + +THC_API void THNN_(TemporalRowConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale); + +THC_API void THNN_(TemporalReflectionPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR); + +THC_API void THNN_(TemporalReflectionPadding_updateGradInput)( + THCState *state, + THCTensor 
*input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR); + +THC_API void THNN_(TemporalReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR); + +THC_API void THNN_(TemporalReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR); + +THC_API void THNN_(TemporalUpSamplingLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth, + bool align_corners); + +THC_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth, + bool align_corners); + +THC_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth); + +THC_API void THNN_(TemporalUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth); + +THC_API void THNN_(Threshold_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal threshold, + accreal val, + bool inplace); + +THC_API void THNN_(Threshold_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal threshold, + accreal val, + bool inplace); + +THC_API void THNN_(VolumetricAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +// VolumetricConvolution is legacy and purposefully not bound by ATen +THC_API void THNN_(VolumetricConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + accreal scale); + +THC_API void THNN_(VolumetricDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +THC_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, 
int padW, int padH, + int dilationT, int dilationW, int dilationH); + +THC_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale); + +THC_API void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, // [OPTIONAL] + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, + accreal scale); + +THC_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +THC_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +THC_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples); + +THC_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices); + +THC_API void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH); + +THC_API 
void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, // [OPTIONAL] + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + accreal scale); + +THC_API void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode); + +THC_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode); + +THC_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH); + +THC_API void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices); + +THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeT, + int osizeW, + int osizeH); + +THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +THC_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +THC_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int 
outputWidth, + bool align_corners); + +#endif diff --git a/aten/src/THCUNN/generic/Tanh.cu b/aten/src/THCUNN/generic/Tanh.cu new file mode 100644 index 0000000..32abd47 --- /dev/null +++ b/aten/src/THCUNN/generic/Tanh.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Tanh.cu" +#else + +#include "../common.h" + +void THNN_(Tanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THCTensor_(tanh)(state, output, input); +} + +void THNN_(Tanh_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_shape(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, tanh_updateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu new file mode 100644 index 0000000..1bb1761 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -0,0 +1,397 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalConvolution.cu" +#else + +static inline void THNN_(TemporalConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + int kW, + int dW, + int *inputFrameSize) { + + THArgCheck(kW > 0, 9, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 11, + "stride should be greater than zero, but got dW: %d", dW); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck(input->size[dimF] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[dimF], *inputFrameSize); + } + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. 
Got: %d, Expected: %d", + input->size[dimS], kW); +} + +void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize) { + + THCTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k, i; + + int dimS = 0; // sequence dimension + + THCUNN_assertSameGPU(state, 4, input, output, weight, bias); + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, &inputFrameSize); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, "bias must be contiguous"); + + if (input->dim() == 3) + { + dimS = 1; + } + + input = THCTensor_(newContiguous)(state, input); + outputWindow = THCTensor_(new)(state); + inputWindow = THCTensor_(new)(state); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->dim() == 2) + { + THCTensor_(resize2d)(state, output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, output, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 0, 1); + THCTensor_(addmm)(state, outputWindow, ScalarConvert::to(1), outputWindow, ScalarConvert::to(1), inputWindow, tweight); + THCTensor_(free)(state, tweight); + } + } + else + { + THCTensor *outputSample = THCTensor_(new)(state); + THCTensor *inputSample = THCTensor_(new)(state); + int nBatchFrame = input->size[0]; + + THCTensor_(resize3d)(state, output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, outputSample, output, 0, i); + THCTensor_(select)(state, inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, outputSample, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 0, 1); + THCTensor_(addmm)(state, 
outputWindow, ScalarConvert::to(1), outputWindow, ScalarConvert::to(1), inputWindow, tweight); + THCTensor_(free)(state, tweight); + } + } + THCTensor_(free)(state, outputSample); + THCTensor_(free)(state, inputSample); + } + + THCTensor_(free)(state, outputWindow); + THCTensor_(free)(state, inputWindow); + THCTensor_(free)(state, input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW) { + + int64_t nInputFrame; + int64_t nOutputFrame; + + THCTensor *gradOutputWindow; + THCTensor *gradInputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, NULL); + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + + /* Not necessary with partial backprop: */ + gradOutputWindow = THCTensor_(new)(state); + gradInputWindow = THCTensor_(new)(state); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + if (gradOutput->dim() == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); + } + } + else + { + THCTensor *gradOutputSample = THCTensor_(new)(state); + THCTensor *gradInputSample = THCTensor_(new)(state); + int64_t nBatchFrame = input->size[0]; + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); + THCTensor_(select)(state, gradInputSample, gradInput, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); + } + } + THCTensor_(free)(state, gradOutputSample); + THCTensor_(free)(state, gradInputSample); + } + + 
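+  // Release the contiguous copies and the temporary window views.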
THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradOutputWindow); + THCTensor_(free)(state, gradInputWindow); + +} + +void THNN_(TemporalConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int dW, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + int64_t nInputFrame; + int64_t nOutputFrame; + + THCTensor *gradOutputWindow; + THCTensor *inputWindow; + int64_t k, i; + + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, NULL); + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + /* Not necessary with partial backprop: */ + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + gradOutputWindow = THCTensor_(new)(state); + inputWindow = THCTensor_(new)(state); + + if (input->dim() == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, gradOutputWindow, gradOutput, 0, k); + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THCTensor *tgradOutputWindow = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); + THCTensor_(addmm)(state, gradWeight, ScalarConvert::to(1), gradWeight, scale, tgradOutputWindow, inputWindow); + THCTensor_(free)(state, tgradOutputWindow); + } + } + else + { + THCTensor *gradOutputSample = THCTensor_(new)(state); + THCTensor *inputSample = THCTensor_(new)(state); + int64_t nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); + THCTensor_(select)(state, inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, gradOutputWindow, gradOutputSample, 0, k); + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THCTensor *tgradOutputWindow = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutputWindow, 
gradOutputWindow, 0, 1); + THCTensor_(addmm)(state, gradWeight, ScalarConvert::to(1), gradWeight, scale, tgradOutputWindow, inputWindow); + THCTensor_(free)(state, tgradOutputWindow); + } + } + THCTensor_(free)(state, gradOutputSample); + THCTensor_(free)(state, inputSample); + } + + THCTensor_(free)(state, gradOutputWindow); + THCTensor_(free)(state, inputWindow); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, input); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalMaxPooling.cu b/aten/src/THCUNN/generic/TemporalMaxPooling.cu new file mode 100644 index 0000000..e355ebd --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalMaxPooling.cu @@ -0,0 +1,188 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu" +#else + +static inline void THNN_(TemporalMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int kW, int dW) { + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + int input_w; + int input_n; + int output_w; + int ndims = input->dim(); + + if (ndims == 3) + { + dimT = 1; + dimF = 2; + } + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(input->size[dimT] >= kW, 2, + "input sequence smaller than kernel size. Got: %d, Expected: %d", + input->size[dimT], kW); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndims, dimT, output_w); + THCUNN_check_dim_size(state, gradOutput, ndims, dimF, input_n) + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, ndims, dimT, output_w); + THCUNN_check_dim_size_indices(state, indices, ndims, dimF, input_n); + } +} + +void THNN_(TemporalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int dW) { + + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + + int batch = 1; + int input_w; + int input_n; + int output_w; + int nthreads; + + real *input_data; + real *output_data; + THCIndex_t *indices_data; + + THCUNN_assertSameGPU(state, 3, input, output, indices); + THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW); + if (input->dim() == 3) + { + dimT = 1; + dimF = 2; + batch = input->size[0]; + } + input = THCTensor_(newContiguous)(state, input); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + if (input->dim() == 2) + { + THCTensor_(resize2d)(state, output, output_w, input->size[dimF]); + THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]); + } + else + { + THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]); + THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]); + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + indices_data = THCIndexTensor_(data)(state, indices); + + dim3 blocks(batch); + nthreads = (output_w / 32) * 32; + if (output_w % 32 > 0) { + nthreads += 32; + } + + if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { + blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; 
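+    // Round up: leftover threads that do not fill a whole block get one extra row of blocks.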
+ if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) { + blocks.y += 1; + } + nthreads = TEMPORAL_MAX_POOLING_THREADS; + } + + dim3 threads(nthreads); + cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + +} + +void THNN_(TemporalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int dW) { + + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + + int batch = 1; + int input_w; + int input_n; + int output_w; + int nthreads; + + real *gradInput_data; + real *gradOutput_data; + THCIndex_t *indices_data; + + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices); + THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + if (input->dim() == 3) + { + dimT = 1; + dimF = 2; + batch = input->size[0]; + } + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + indices_data = THCIndexTensor_(data)(state, indices); + + dim3 blocks(batch); + nthreads = (output_w / 32) * 32; + if (output_w % 32 > 0) { + nthreads += 32; + } + + if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { + blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; + if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) { + blocks.y += 1; + } + nthreads = TEMPORAL_MAX_POOLING_THREADS; + } + + dim3 threads(nthreads); + if (kW <= dW) { + cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); + } else { + cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); + } + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu new file mode 100644 index 0000000..394c796 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -0,0 +1,119 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalReflectionPadding.cu" +#else + +void THNN_(TemporalReflectionPadding_updateOutput)(THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 2 || numInputDims == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 3) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(padL < inputW && padR < inputW, 4, + "Padding 
size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padL, padR, dimw, THCTensor_(sizeDesc)(state, input).str); + + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 , 2, + "input (W: %d)is too small." + " Calculated output W: %d", + inputW, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 2) { + THCTensor_(resize2d)(state, output, numPlanes, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<3>(); + devOutput = toDeviceTensor(state, output).upcastOuter<3>(); + } else { + THCTensor_(resize3d)(state, output, numBatch, numPlanes, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReflectionPadding_updateOutput<<>>( + devInput, devOutput, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(TemporalReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 3) { + planeDim++; + dimw++; + } + int iwidth = input->size[dimw]; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 2) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<3>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<3>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
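+                 /* use at most 256 threads per block; smaller planes get one thread per output element */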
256 : outputPlaneSize); + + TemporalReflectionPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu new file mode 100644 index 0000000..11637dc --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -0,0 +1,114 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalReplicationPadding.cu" +#else + +void THNN_(TemporalReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 2 || numInputDims == 3), 2, input, + "2D or 3D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 3) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputW = THCTensor_(size)(state, input, dimw); + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1, 2, + "input (W: %d)is too small." + " Calculated output W: %d", + inputW, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 2) { + THCTensor_(resize2d)(state, output, numPlanes, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<3>(); + devOutput = toDeviceTensor(state, output).upcastOuter<3>(); + } else { + THCTensor_(resize3d)(state, output, numBatch, numPlanes, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReplicationPadding_updateOutput<<>>( + devInput, devOutput, padL, padR); + +} + +void THNN_(TemporalReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 3) { + planeDim++; + dimw++; + } + int iwidth = input->size[dimw]; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 2) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<3>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<3>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padL, padR); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu new file mode 100644 index 0000000..26361d4 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -0,0 +1,430 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalRowConvolution.cu" +#else + +static inline void THNN_(TemporalRowConvolution_shapeCheck)( + THCState *state, THCTensor *input, THCTensor *gradOutput, THCTensor *weight, + THCTensor *bias, int kW, int dW, int padW) { + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, "stride should be greater than zero, but got dW: %d", + dW); + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 3), 3, + weight, "non-empty 2D or 3D weight tensor expected, but got: %s"); + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + + int ndim = input->dim(); + int dimF = 0; // feature dimension + int dimS = 1; // sequence dimension + + if (ndim == 3) { + ++dimF; + ++dimS; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 1, input, + "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[dimS]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (nOutputFrame < 1) { + THError("Given input size: (%d x %d). " + "Calculated output size: (%d x %d). 
Output size is too small", + inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame); + } + + THCUNN_check_dim_size(state, input, ndim, dimF, inputFrameSize); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimF, inputFrameSize); + THCUNN_check_dim_size(state, gradOutput, ndim, dimS, nOutputFrame); + } +} + +void THNN_(TemporalRowConvolution_updateOutput)( + THCState *state, THCTensor *input, THCTensor *output, THCTensor *weight, + THCTensor *bias, THCTensor *finput, THCTensor *fgradInput, int kW, int dW, + int padW, bool featFirst) { + + // aliases + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + // assert same GPU + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias != NULL) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, "bias must be contiguous"); + + // reshape weight if necessary + int ndim = input->dim(); + + THCTensor *tinput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + } else { + input = THCTensor_(newContiguous)(state, input); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, NULL, weight, bias, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + } + + // Params: + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + // Batch size + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize3d)(state, output, batchSize, inputFrameSize, nOutputFrame); + + // Augment the input + THCTensor_(resize3d)(state, columns, inputFrameSize, kW, nOutputFrame); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever + // gets increased and always contains ones. + if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + // Resize plane and fill with ones... 
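+    // A 1 x nOutputFrame row of ones is enough: the bias GEMM below uses it to broadcast the bias across every output frame.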
+ THCTensor_(resize2d)(state, ones, 1, nOutputFrame); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do bias first: + // m_, n_, k_ are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = inputFrameSize; + int64_t n_ = nOutputFrame; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm asummes + // column-major matrices) + if (bias != NULL) { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( +#elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( +#endif + state, 't', 'n', n_, m_, k_, ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), THCTensor_(data)(state, output_n), + n_); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + row2col(THCState_getCurrentStream(state), THCTensor_(data)(state, input_n), + inputFrameSize, nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, columns)); + + THCTensor *output3d = THCTensor_(newWithStorage3d)( + state, output_n->storage, output_n->storageOffset, inputFrameSize, -1, + 1, -1, nOutputFrame, -1); + + // weight: inputFrameSize x 1 x kW + // columns: inputFrameSize x kW x nOutputFrame + THCTensor_(baddbmm)(state, output3d, ScalarConvert::to(1), + output3d, ScalarConvert::to(1), weight, + columns); + // output3d: inputFrameSize x 1 x nOutputFrame + + THCTensor_(free)(state, output3d); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize2d)(state, output, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + } + + if (!featFirst) { + THCTensor_(transpose)(state, output, output, ndim - 1, ndim - 2); + THCTensor_(free)(state, tinput); + } + + THCTensor_(free)(state, input); +} + +void THNN_(TemporalRowConvolution_updateGradInput)( + THCState *state, THCTensor *input, THCTensor *gradOutput, + THCTensor *gradInput, THCTensor *weight, THCTensor *finput, + THCTensor *fgradInput, int kW, int dW, int padW, bool featFirst) { + + // aliases + THCTensor *gradColumns = finput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, + gradInput); + + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + + int ndim = input->dim(); + + THCTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + tgradOutput = + THCTensor_(newTranspose)(state, gradOutput, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + gradOutput = THCTensor_(newContiguous)(state, tgradOutput); + + } else { + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], + gradOutput->size[1]); + } + + // 
Params: + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = gradOutput->size[2]; + + // Batch size + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize3d)(state, gradInput, batchSize, inputFrameSize, + nInputFrame); + + // Resize temporary columns + THCTensor_(resize3d)(state, gradColumns, inputFrameSize, kW, nOutputFrame); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 1, 2); + + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( + state, gradOutput_n->storage, gradOutput_n->storageOffset, + inputFrameSize, -1, 1, -1, nOutputFrame, -1); + + // weight: inputFrameSize x kW x 1 + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + THCTensor_(baddbmm)(state, gradColumns, ScalarConvert::to(0), + gradColumns, ScalarConvert::to(1), tweight, + gradOutput3d); + // gradColumns: inputFrameSize x kW x nOutputFrame + + // Unpack columns back into input: + col2row(THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), inputFrameSize, + nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, gradInput_n)); + + THCTensor_(free)(state, gradOutput3d); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize2d)(state, gradOutput, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + THCTensor_(resize2d)(state, gradInput, inputFrameSize, nInputFrame); + } + + THCTensor_(free)(state, tweight); + + if (!featFirst) { + THCTensor_(transpose)(state, gradInput, gradInput, ndim - 1, ndim - 2); + THCTensor_(free)(state, tinput); + THCTensor_(free)(state, tgradOutput); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(TemporalRowConvolution_accGradParameters)( + THCState *state, THCTensor *input, THCTensor *gradOutput, + THCTensor *gradWeight, THCTensor *gradBias, THCTensor *finput, + THCTensor *fgradInput, int kW, int dW, int padW, bool featFirst, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + // Aliases + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias != NULL) { + THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); + } + + int ndim = input->dim(); + + THCTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + tgradOutput = + THCTensor_(newTranspose)(state, gradOutput, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + gradOutput = THCTensor_(newContiguous)(state, tgradOutput); + } else { + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], + 
gradOutput->size[1]); + } + + // Params: + int64_t inputFrameSize = gradWeight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = gradOutput->size[2]; + + // Batch size + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, 1, nOutputFrame); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // // Resize temporary columns + THCTensor_(resize3d)(state, columns, inputFrameSize, kW, nOutputFrame); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per output + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( + state, gradOutput_n->storage, gradOutput_n->storageOffset, + inputFrameSize, -1, 1, -1, nOutputFrame, -1); + + // Extract columns + row2col(THCState_getCurrentStream(state), THCTensor_(data)(state, input_n), + inputFrameSize, nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, columns)); + + THCTensor *tcolumns = THCTensor_(new)(state); + THCTensor_(transpose)(state, tcolumns, columns, 1, 2); + + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + // columns: inputFrameSize x nOutputFrame x kW + THCTensor_(baddbmm)(state, gradWeight, ScalarConvert::to(1), + gradWeight, scale, gradOutput3d, tcolumns); + // gradWeight: inputFrameSize x 1 x kW + + THCTensor_(free)(state, tcolumns); + THCTensor_(free)(state, gradOutput3d); + + if (gradBias != NULL) { + int64_t m_ = inputFrameSize; + int64_t k_ = nOutputFrame; +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( +#endif + state, 't', k_, m_, scale, THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1); +#endif +#ifdef THC_REAL_IS_HALF // half not supported due to baddbmm + THCudaBlas_Hgemm(state, 't', 'n', m_, 1, k_, scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_); +#endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize2d)(state, gradOutput, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + } + + if (!featFirst) { + THCTensor_(free)(state, tinput); + THCTensor_(free)(state, tgradOutput); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu new file mode 100644 index 0000000..6199eef --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu @@ -0,0 +1,95 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalUpSamplingLinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, + int outputWidth) { + THArgCheck(inputWidth > 0 && 
outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 3, 2, input, + "non-empty 3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputWidth = THCTensor_(size)(state, input, 2); + THNN_(TemporalUpSamplingLinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputWidth, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize3d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputWidth > 0 && outputWidth > 0); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth, + bool align_corners) +{ + THNN_(TemporalUpSamplingLinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputWidth, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu new file mode 100644 index 0000000..55dfea2 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu @@ -0,0 +1,90 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, + int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be 
greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 3, 2, input, + "3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputWidth = THCTensor_(size)(state, input, 2); + + THNN_(TemporalUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, inputWidth, outputWidth); + THAssert(inputWidth > 0 && outputWidth > 0); + + THCTensor_(resize3d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_3d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(TemporalUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, inputWidth, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + + const int num_kernels = outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + + nearest_neighbor_3d_kernel_backward <<>>(num_kernels, data1, data2); + + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Threshold.cu b/aten/src/THCUNN/generic/Threshold.cu new file mode 100644 index 0000000..794ad45 --- /dev/null +++ b/aten/src/THCUNN/generic/Threshold.cu @@ -0,0 +1,70 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Threshold.cu" +#else + +#include "../common.h" + +void THNN_(Threshold_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = ScalarConvert::to(threshold_); + real val = ScalarConvert::to(val_); + THCUNN_assertSameGPU(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, + ThresholdUpdateOutputIP(threshold, val) + ); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, + ThresholdUpdateOutput(threshold, val) + ); + } + + THCudaCheck(cudaGetLastError()); +} + +void THNN_(Threshold_updateGradInput)( + THCState *state, + THCTensor *input, + 
THCTensor *gradOutput, + THCTensor *gradInput, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = ScalarConvert::to(threshold_); + real val = ScalarConvert::to(val_); + (void) val; + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput); + + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, input, + ThresholdUpdateGradInputIP(threshold) + ); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, + ThresholdUpdateGradInput(threshold) + ); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu new file mode 100644 index 0000000..d297483 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu @@ -0,0 +1,173 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAdaptiveAveragePooling.cu" +#else + +#include "../common.h" + +// 5d tensor B x D x T x H x W + +void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeT, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + + real *output_data; + real *input_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + istrideD = input->stride[0]; + istrideT = input->stride[1]; + istrideH = input->stride[2]; + istrideW = input->stride[3]; + + THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeD * osizeT; + } else { + input = THCTensor_(newContiguous)(state, input); + + int64_t sizeB = input->size[0]; + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + istrideD = input->stride[1]; + istrideT = input->stride[2]; + istrideH = input->stride[3]; + istrideW = input->stride[4]; + + THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeB * sizeD * osizeT; + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
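+                /* each launch covers at most 65535 slices; the loop advances offsetZ for the remainder */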
65535 : totalZ, blocksH); + cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel + <<>>( + input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, + istrideD, istrideT, istrideH, istrideW, offsetZ + ); + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + if (input->dim() == 5) { + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + real *gradInput_data; + real *gradOutput_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + osizeT = gradOutput->size[1]; + osizeH = gradOutput->size[2]; + osizeW = gradOutput->size[3]; + } else { + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + osizeT = gradOutput->size[2]; + osizeH = gradOutput->size[3]; + osizeW = gradOutput->size[4]; + } + + // somehow nonatomic is passing all test for volumetric case. + bool atomic = false; //(isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input->dim() == 4) { + totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; + } else { + int sizeB = input->size[0]; + totalZ = atomic ? sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; + } + + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + + if (atomic) + { + cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, offsetZ + ); + } else { + cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, offsetZ + ); + } + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + // clean + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu new file mode 100644 index 0000000..7f876ae --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu @@ -0,0 +1,178 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAdaptiveMaxPooling.cu" +#else + +#include "../common.h" + +// 5d tensor B x D x T x H x W + +void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THCIndex_t *indices_data; + real *output_data; + real *input_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + istrideD = input->stride[0]; + istrideT = input->stride[1]; + istrideH = input->stride[2]; + istrideW = input->stride[3]; + + THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); + THCIndexTensor_(resize4d)(state, indices, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeD * osizeT; + } else { + input = THCTensor_(newContiguous)(state, input); + + int64_t sizeB = input->size[0]; + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + istrideD = input->stride[1]; + istrideT = input->stride[2]; + istrideH = input->stride[3]; + istrideW = input->stride[4]; + + THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); + THCIndexTensor_(resize5d)(state, indices, sizeB, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeB * sizeD * osizeT; + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + indices_data = THCIndexTensor_(data)(state, indices); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + cunn_VolumetricAdaptiveMaxPooling_updateOutput_kernel + <<>>( + input_data, output_data, indices_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW, offsetZ + ); + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + if (input->dim() == 5) { + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices) +{ + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCIndex_t *indices_data; + real *gradInput_data; + real *gradOutput_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + osizeT = gradOutput->size[1]; + osizeH = gradOutput->size[2]; + osizeW = gradOutput->size[3]; + } else { + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + osizeT = gradOutput->size[2]; + osizeH = gradOutput->size[3]; + osizeW = gradOutput->size[4]; + } + + bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input->dim() == 4) { + totalZ = sizeD * osizeT; + } else { + int sizeB = input->size[0]; + totalZ = sizeB * sizeD * osizeT; + } + + indices_data = THCIndexTensor_(data)(state, indices); + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + + if (atomic) + { + cunn_atomic_VolumetricAdaptiveMaxPooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, indices_data, + isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ + ); + } else { + cunn_VolumetricAdaptiveMaxPooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, indices_data, + isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ + ); + } + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + // clean + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu new file mode 100644 index 0000000..b32643d --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu @@ -0,0 +1,383 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAveragePooling.cu" +#else + +static inline void THNN_(VolumetricAveragePooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode) +{ + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int ndim = input->dim(); + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 4) + { + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 5) + { + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, but got size: ", input->sizes()); + } + + // The second argument is the index of padH. 
+ THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11, + "pad should not be greater than half of kernel size, but got " + "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d", + padT, padW, padH, kT, kW, kH); + + int outputTime; + int outputHeight; + int outputWidth; + + if (ceil_mode) + { + outputTime = ceil(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = ceil(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = ceil(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + else + { + outputTime = floor(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = floor(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = floor(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + if (padT || padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (gradOutput != NULL) + { + THCUNN_check_dim_size(state, gradOutput, ndim, dimN, inputSlices); + THCUNN_check_dim_size(state, gradOutput, ndim, dimt, outputTime); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (fiveDimensionalInput) + { + dimt++; + dimh++; + dimw++; + } + + THNN_(VolumetricAveragePooling_shapeCheck) + (state, input, NULL, kT, kW, kH, dT, dW, dH, + padT, padW, padH, ceil_mode); + + if (!fiveDimensionalInput) /* 4D */ + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else /* 5D */ + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + int outputTime; + int outputHeight; + int outputWidth; + + if (ceil_mode) + { + outputTime = ceil(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = ceil(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = ceil(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + else + { + outputTime = floor(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = floor(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = floor(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + if (padT || padH || padW) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ 
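The output extent along each pooled dimension follows the formula computed twice above (once in the shape check, once here): out = floor_or_ceil((in - k + 2*pad) / stride) + 1, trimmed so that the last pooling window still starts inside the padded input. A minimal standalone sketch of that computation; the helper name is illustrative and does not appear in the sources:

#include <cmath>

// One pooled dimension, mirroring the ceil_mode / floor computation above.
static inline int pooled_extent(int in, int k, int stride, int pad, bool ceil_mode) {
  float raw = float(in - k + 2 * pad) / float(stride);
  int out = (ceil_mode ? (int)std::ceil(raw) : (int)std::floor(raw)) + 1;
  // Ensure the last pooling window starts inside the image; with this formula
  // the trim can only trigger when padding is used in ceil mode.
  if (pad > 0 && (out - 1) * stride >= in + pad) {
    --out;
  }
  return out;
}

// Example: in = 3, k = 2, stride = 2, pad = 1 in ceil mode gives ceil(1.5) + 1 = 3,
// but the last window would start at (3 - 1) * 2 = 4 >= in + pad = 4, so the
// extent is trimmed to 2 (the same value floor mode produces here).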
+ THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + } + else /* 5D */ + { + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + output = THCTensor_(newFoldBatchDim)(state, output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + } else { + THCTensor_(retain)(state, output); + } + + THCDeviceTensor cudaInput; + THCDeviceTensor cudaOutput; + cudaInput = toDeviceTensor(state, input); + cudaOutput = toDeviceTensor(state, output); + + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + switch (kW) + { + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7); + default: + cuda_VolumetricAveragePooling_updateOutput + <<>>( + cudaInput, + cudaOutput, + kT, kH, kW, + dT, dH, dW, + padT, padH, padW, + count_include_pad, + offsetZ); + break; + } + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); +} + +void THNN_(VolumetricAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THNN_(VolumetricAveragePooling_shapeCheck) + (state, input, gradOutput, kT, kW, kH, dT, dW, dH, + padT, padW, padH, ceil_mode); + bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW); + + // Resize and initialize result tensor. 
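Two idioms recur in the launches above and below. First, CUDA caps gridDim.z at 65535, so the total number of (batch x slice x frame) planes is walked in chunks, with the running offsetZ passed to the kernel so it knows which planes the current launch covers. Second, when pooling windows can overlap (the kernelsOverlap test above, or the size-modulo test in the adaptive kernels), several output positions write into the same gradInput element, which is why the atomicAdd kernel variants are selected in those cases. A simplified but compilable CUDA sketch of the chunked launch; the kernel and driver names are hypothetical:

#include <cuda_runtime.h>

// Hypothetical kernel: each z-block handles one plane, identified by
// blockIdx.z plus the host-side chunk offset.
__global__ void process_planes(float* data, int planeSize, int totalPlanes, int offsetZ) {
  int plane = blockIdx.z + offsetZ;
  if (plane >= totalPlanes) return;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < planeSize) {
    data[plane * planeSize + i] += 1.0f;  // stand-in for the real pooling arithmetic
  }
}

// Host driver mirroring the while (totalZ > 0) loops in these files: never ask
// for more than 65535 blocks in z, and advance offsetZ between launches.
void launch_in_z_chunks(float* data, int planeSize, int totalPlanes, cudaStream_t stream) {
  int totalZ = totalPlanes;
  int offsetZ = 0;
  dim3 block(256);
  while (totalZ > 0) {
    dim3 grid((planeSize + block.x - 1) / block.x, 1, totalZ > 65535 ? 65535 : totalZ);
    process_planes<<<grid, block, 0, stream>>>(data, planeSize, totalPlanes, offsetZ);
    totalZ -= 65535;
    offsetZ += 65535;
  }
}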
+ THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int outputTime; + int outputHeight; + int outputWidth; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + + outputTime = THCTensor_(size)(state, gradOutput, 1); + outputHeight = THCTensor_(size)(state, gradOutput, 2); + outputWidth = THCTensor_(size)(state, gradOutput, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + + outputTime = THCTensor_(size)(state, gradOutput, 2); + outputHeight = THCTensor_(size)(state, gradOutput, 3); + outputWidth = THCTensor_(size)(state, gradOutput, 4); + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + THCDeviceTensor cudaGradInput; + THCDeviceTensor cudaGradOutput; + cudaGradInput = toDeviceTensor(state, gradInput); + cudaGradOutput = toDeviceTensor(state, gradOutput); + + dim3 block(32, 8); + + // Optimizing for stride 1 is probably only of limited value, but this + // specialization yields 3x speedup over the atomicAdd implementation. + // Padding must be 0, otherwise, pool size may change. + if (dT == 1 && dH == 1 && dW == 1 && padT == 0 && padH == 0 && padW == 0) + { + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + cuda_VolumetricAveragePooling_updateGradInput_Stride1 + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + } + else + { + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + if (kernelsOverlap) + { + cuda_VolumetricAveragePooling_updateGradInput_atomicAdd + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, + padT, padH, padW, count_include_pad, offsetZ); + } + else + { + cuda_VolumetricAveragePooling_updateGradInput + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, + padT, padH, padW, count_include_pad, offsetZ); + } + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + } + + THCTensor_(free)(state, gradInput); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricConvolution.cu b/aten/src/THCUNN/generic/VolumetricConvolution.cu new file mode 100644 index 0000000..e76f8cb --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricConvolution.cu @@ -0,0 +1,525 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricConvolution.cu" +#else + +static inline void THNN_(VolumetricConvolution_shapeCheck) + (THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *gradWeight, + THCTensor *bias, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(!weight || THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + THArgCheck(!gradWeight || THCTensor_(isContiguous)(state, gradWeight), 5, + "gradWeight tensor has to be contiguous"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + + if (gradOutput != NULL) { + THCUNN_argCheck(state, !gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, + gradOutput, + "non-empty 4D or 5D (batch mode) tensor expected for gradOutput, but got: %s"); + } + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + } + + if (gradWeight != NULL) { + THCUNN_argCheck(state, !gradWeight->is_empty() && gradWeight->dim() == 5, 4, gradWeight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for gradWeight, but got: %s"); + } + + if (weight == NULL) { + weight = gradWeight; + } + int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size[1]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + int dimd = 3; + + if (ndim == 5) + { + dimf++; + dimh++; + dimw++; + dimd++; + } + + int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size[dimh]; + int64_t inputDepth = input->size[dimd]; + + int64_t exactInputDepth = inputDepth + 2*padT; + int64_t exactInputHeight = inputHeight + 2*padH; + int64_t exactInputWidth = inputWidth + 2*padW; + + if (exactInputDepth < kT || exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated input size: (%d x %d x %d). " + "Kernel size: (%d x %d x %d). 
Kernel size can't be greater than actual input size", + exactInputDepth,exactInputHeight,exactInputWidth,kT,kH,kW); + } + + int64_t outputWidth = (exactInputDepth - kH) / dH + 1; + int64_t outputHeight = (exactInputHeight - kT) / dT + 1; + int64_t outputDepth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1) + { + THError( + "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane, inputDepth, inputHeight, inputWidth, + nOutputPlane, outputDepth, outputHeight, outputWidth + ); + } + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + } +} + +void THNN_(VolumetricConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU(state, 6, input, output, weight, bias, columns, ones); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, NULL, weight, NULL, + bias, dT, dW, dH, padT, padW, padH); + input = THCTensor_(newContiguous)(state, input); + + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kH = (int)weight->size[3]; + int kW = (int)weight->size[4]; + + int batch = 1; + if (input->dim() == 4) + { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], + input->size[2], input->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, + outputHeight, outputWidth, outputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
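The reason a buffer that only ever contains ones is enough for the bias is that the bias GEMM inside the per-sample loop below is a rank-1 update: with k_ = 1 and beta = 0, output_n(m, n) is overwritten by bias(m) * ones(n), i.e. each channel's bias is broadcast across all outputDepth*outputHeight*outputWidth positions before the im3d2col GEMM accumulates the convolution on top of it (beta = 1). A small CPU sketch of that broadcast, using illustrative names and row-major buffers rather than the column-major BLAS call:

#include <vector>

// output is m x n (row-major), bias has length m = nOutputPlane,
// n = outputDepth * outputHeight * outputWidth.
void bias_as_rank1_gemm(const std::vector<float>& bias,
                        std::vector<float>& output,
                        int m, int n) {
  std::vector<float> ones(n, 1.0f);            // the shared "ones" buffer
  for (int row = 0; row < m; ++row) {
    for (int col = 0; col < n; ++col) {
      // alpha = 1, beta = 0: overwrite with the broadcast bias
      output[row * n + col] = bias[row] * ones[col];
    }
  }
}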
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = columns->size[1]; + int64_t k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } + THCTensor_(free)(state, input); +} + +void THNN_(VolumetricConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + + int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size[1]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + + THCTensor *gradColumns = finput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, + NULL, dT, dW, dH, padT, padW, padH); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int batch = 1; + if (input->dim() == 4) + { + input = THCTensor_(newContiguous)(state, input); + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im3d( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth); + THCTensor_(free)(state, input); + } + THCTensor_(free)(state, gradOutput); + +} + +void THNN_(VolumetricConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, gradOutput, NULL, gradWeight, + gradBias, dT, dW, dH, padT, padW, padH); + + int nOutputPlane = (int)gradWeight->size[0]; + int nInputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kH = (int)gradWeight->size[3]; + int kW = (int)gradWeight->size[4]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int batch = 1; + if (input->dim() == 4) + { + 
// Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = gradWeight->size[0]; + int64_t n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, 
outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu new file mode 100644 index 0000000..5751ab4 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -0,0 +1,506 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedConvolution.cu" +#else + +static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int weight_nullable) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + int64_t inputDepth = input->size[dimd]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). 
Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, NULL, weight, bias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? THCTensor_(newContiguous)(state, bias) : bias; + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputHeight = input->size[3]; + int64_t inputWidth = input->size[4]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + weight = THCTensor_(newContiguous)(state, weight); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + 
is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kT*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2vol( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 1); + + 
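What the per-sample loop below accumulates can be stated directly: vol2col unfolds input_n into a (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth) matrix of columns, the GEMM then adds scale * gradOutput_n * columns^T into gradWeight, and the GEMV against the ones buffer adds scale times the per-channel sum of gradOutput_n into gradBias (the code guards each with a null check on gradWeight / gradBias). A minimal CPU sketch of those two accumulations; the names and the row-major layout are illustrative, not the actual column-major BLAS calls:

#include <vector>

// m = nOutputPlane, q = nInputPlane*kT*kH*kW, k = outputDepth*outputHeight*outputWidth.
// gradOutput_n is m x k, columns is q x k, gradWeight is m x q (all row-major).
void acc_grad_parameters_sketch(const std::vector<float>& gradOutput_n,
                                const std::vector<float>& columns,
                                std::vector<float>& gradWeight,
                                std::vector<float>& gradBias,
                                int m, int q, int k, float scale) {
  for (int o = 0; o < m; ++o) {
    for (int c = 0; c < q; ++c) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p)
        acc += gradOutput_n[o * k + p] * columns[c * k + p];
      gradWeight[o * q + c] += scale * acc;   // the 't','n' GEMM below
    }
    float bacc = 0.f;                         // the GEMV against the ones buffer
    for (int p = 0; p < k; ++p)
      bacc += gradOutput_n[o * k + p];
    gradBias[o] += scale * bacc;
  }
}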
// Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kT*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + 
state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu new file mode 100644 index 0000000..b694c37 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu @@ -0,0 +1,409 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.cu" +#else + +#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ + cuda_VolumetricDilatedMaxPooling_updateOutput \ + <<>>( \ + inputData, inputTime, inputHeight, inputWidth, \ + cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\ + dilationT, dilationH, dilationW, offsetZ); \ + break + +static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) { + int ndim = input->dim(); + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + int outputTime; + int outputHeight; + int outputWidth; + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 7, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 16, + "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d", + dilationT, dilationH, dilationW); + + if (input->dim() == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", input->sizes()); + } + + THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 13, + "pad should be smaller than half of kernel size, but got " + "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", + kT, kW, kH, padT, padW, padH); + + if (ceilMode) + { + outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputTime = 
(int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputTime < 1 || outputHeight < 1 || outputWidth < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, inputSlices); + THCUNN_check_dim_size(state, gradOutput, ndim, dimt, outputTime); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, ndim, dimf, inputSlices); + THCUNN_check_dim_size_indices(state, indices, ndim, dimt, outputTime); + THCUNN_check_dim_size_indices(state, indices, ndim, dimh, outputHeight); + THCUNN_check_dim_size_indices(state, indices, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + int outputTime; + int outputHeight; + int outputWidth; + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + + if (fiveDimensionalInput) + { + dimt++; + dimh++; + dimw++; + } + + THCUNN_assertSameGPU(state, 3, input, indices, output); + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, NULL, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, + dilationT, dilationW, dilationH, ceilMode); + + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (fiveDimensionalInput) + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", input->sizes()); + } + + if (ceilMode) + { + outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(floor((float)(inputWidth - 
(dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + /* indices pack ti,i,j locations for each output point as uchar into + each float of the tensor */ + THCIndexTensor_(resize4d)(state, indices, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + // Index tensor packs index offsets as uchars into floats + THCIndexTensor_(resize5d)(state, indices, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + fiveDimensionalInput = 1; + } + + input = THCTensor_(newContiguous)(state, input); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + output = THCTensor_(newFoldBatchDim)(state, output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + } else { + THCTensor_(retain)(state, output); + } + + real* inputData = THCTensor_(data)(state, input); + + THCDeviceTensor cudaOutput; + cudaOutput = toDeviceTensor(state, output); + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + int64_t indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), + indicesSize, NULL); + + THLongStorage_free(indicesSize); + + THCDeviceTensor cudaIndices = + toDeviceTensor(state, indices1); + + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + switch (kW) + { + UPDATE_OUTPUT_KERNEL_WIDTH(1); + UPDATE_OUTPUT_KERNEL_WIDTH(2); + UPDATE_OUTPUT_KERNEL_WIDTH(3); + UPDATE_OUTPUT_KERNEL_WIDTH(4); + UPDATE_OUTPUT_KERNEL_WIDTH(5); + UPDATE_OUTPUT_KERNEL_WIDTH(6); + UPDATE_OUTPUT_KERNEL_WIDTH(7); + default: + cuda_VolumetricDilatedMaxPooling_updateOutput<<>>( + inputData, inputTime, inputHeight, inputWidth, + cudaIndices, cudaOutput, + kT, kH, kW, dT, dH, dW, + padT, padH, padW, dilationT, dilationH, dilationW, offsetZ); + } + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); + THCIndexTensor_(free)(state, indices1); +} + +#undef UPDATE_OUTPUT_KERNEL_WIDTH + +void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) +{ + // TODO: gradOutput shape check + // Resize and initialize result tensor. 
+ THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + int batchSize; + int inputSlices; + + int outputTime, outputHeight, outputWidth; + int inputTime, inputHeight, inputWidth; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, gradOutput, indices, kT, kW, kH, + dT, dW, dH, padT, padW, padH, + dilationT, dilationW, dilationH, ceilMode); + + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + + outputTime = THCTensor_(size)(state, gradOutput, 1); + outputHeight = THCTensor_(size)(state, gradOutput, 2); + outputWidth = THCTensor_(size)(state, gradOutput, 3); + inputTime = THCTensor_(size)(state, gradInput, 1); + inputHeight = THCTensor_(size)(state, gradInput, 2); + inputWidth = THCTensor_(size)(state, gradInput, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + + outputTime = THCTensor_(size)(state, gradOutput, 2); + outputHeight = THCTensor_(size)(state, gradOutput, 3); + outputWidth = THCTensor_(size)(state, gradOutput, 4); + inputTime = THCTensor_(size)(state, gradInput, 2); + inputHeight = THCTensor_(size)(state, gradInput, 3); + inputWidth = THCTensor_(size)(state, gradInput, 4); + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + THCDeviceTensor cudaGradOutput; + cudaGradOutput = toDeviceTensor(state, gradOutput); + real* gradInputData = THCTensor_(data)(state, gradInput); + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + int64_t indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), indicesSize, NULL); + THLongStorage_free(indicesSize); + + THCDeviceTensor cudaIndices = + toDeviceTensor(state, indices1); + + int64_t totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricDilatedMaxPooling_updateGradInput<<>>( + cudaGradOutput, + cudaIndices, + gradInputData, + inputTime, inputHeight, inputWidth, + dT, dH, dW, + padT, padH, padW, + dilationT, dilationH, dilationW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, gradInput); + THCTensor_(free)(state, gradOutput); + THCIndexTensor_(free)(state, indices1); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu new file mode 100644 index 0000000..f4e731f --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu @@ -0,0 +1,168 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.cu" +#else + +void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples) +{ + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int dimt = 3; + int64_t numBatch = 1; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 5) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + dimt++; + } + + /* sizes */ + int64_t numPlanes = THCTensor_(size)(state, input, planeDim); + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + int64_t inputT = THCTensor_(size)(state, input, dimt); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 7, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 < inputW, 6, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + THArgCheck(outputT + poolSizeT - 1 < inputW, 5, + "poolSizeT (%d) too large relative to input time (%d)", + poolSizeT, inputT); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + THCDeviceTensor devIndices; + THCDeviceTensor devSamples = + toDeviceTensor(state, randomSamples); + + if (numInputDims == 4) { + /* resize output */ + THCTensor_(resize4d)(state, output, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize4d)(state, indices, numPlanes, outputH, outputW, outputT); + + devInput = toDeviceTensor(state, input).upcastOuter<5>(); + devOutput = toDeviceTensor(state, output).upcastOuter<5>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<5>(); + } else { + THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize5d)(state, indices, numBatch, numPlanes, outputH, outputW, outputT); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * devOutput.getSize(4); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devInput.getSize(1), + devInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); + +#define SFMP_UPDATE_OUTPUT(POOL_W) \ + VolumetricFractionalMaxPooling_updateOutput \ + <<>>( \ + devInput, devOutput, devIndices, devSamples, poolSizeT, poolSizeW, poolSizeH); + +#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ + case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break + + switch (poolSizeW) { + SFMP_UPDATE_OUTPUT_CASE(2); + SFMP_UPDATE_OUTPUT_CASE(3); + SFMP_UPDATE_OUTPUT_CASE(4); + SFMP_UPDATE_OUTPUT_CASE(5); + SFMP_UPDATE_OUTPUT_CASE(6); + SFMP_UPDATE_OUTPUT_CASE(7); + default: + // dynamic pool width + SFMP_UPDATE_OUTPUT_CASE(-1); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices) +{ + int dimh = 1; + int dimw = 2; + int dimt = 3; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 5) { + dimh++; + dimw++; + dimt++; + } + + /* sizes */ + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + int64_t inputT = THCTensor_(size)(state, input, dimt); + + THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected"); + THArgCheck(outputT == THCTensor_(size)(state, gradOutput, dimt), 3, + "gradOutput time unexpected"); + + /* resize */ + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + THCDeviceTensor devIndices; + + /* backprop */ + if (numInputDims == 4) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<5>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<5>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<5>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * devGradOutput.getSize(4); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devGradInput.getSize(1), + devGradInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); + + VolumetricFractionalMaxPooling_updateGradInput + <<>>( + devGradInput, devGradOutput, devIndices); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFullConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullConvolution.cu new file mode 100644 index 0000000..e2a2f55 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFullConvolution.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu" +#else + +void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + THNN_(VolumetricFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); +} + + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + accreal scale_) +{ + THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH, scale_); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu new file mode 100644 index 0000000..bd653b9 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -0,0 +1,537 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFullDilatedConvolution.cu" +#else + +static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, int weight_nullable) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + THArgCheck((adjT < dT || adjT < dilationT) + && (adjW < dW || adjW < dilationW) + && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller 
than either stride or dilation," + " but got adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d " + "dilationT: %d dilationH: %d dilationW: %d", + adjT, adjH, adjW, dT, dH, dW, dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + if (weight != NULL) { + const int64_t nInputPlane = THCTensor_(size)(state, weight, 0); + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size[dimh]; + int64_t inputDepth = input->size[dimd]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + const int64_t nOutputPlane = THCTensor_(size)(state, weight, 1); + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + const int64_t nOutputPlane = THCTensor_(size)(state, bias, 0); + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH) +{ + + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 6, input, output, weight, + bias, columns, ones); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, NULL, weight, bias, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 0); + + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputWidth = 
input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2vol( + THCState_getCurrentStream(state), + THCTensor_(data)(state, columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n_ + ); + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, 
inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + +} + +void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH) +{ + THCTensor *gradColumns = finput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 0); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + + // 
Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + + +void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, + accreal scale_) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 1); + + int nOutputPlane; + if (gradWeight) { + nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THCTensor_(size)(state, gradBias, 0); + } else { + return; + } + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
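+    // The `ones` plane is consumed by the gradBias GEMV/GEMM further down:
+    // multiplying gradOutput_n by a vector of ones reduces it over its
+    // spatial extent, i.e. (conceptually)
+    //   gradBias[c] += scale * sum_{d,h,w} gradOutput_n[c][d][h][w]
+    // The buffer only needs at least outputDepth*outputHeight*outputWidth
+    // elements, which is why it is grown here instead of reallocated per call.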
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, input->size[1], inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu new file mode 100644 index 0000000..8722ce9 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" +#else + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor 
*input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimension)(state, input) == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimension)(state, grid) == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t idepth = THCTensor_(size)(state, input, 2); + int64_t iheight = THCTensor_(size)(state, input, 3); + int64_t iwidth = THCTensor_(size)(state, input, 4); + int64_t odepth = THCTensor_(size)(state, grid, 1); + int64_t oheight = THCTensor_(size)(state, grid, 2); + int64_t owidth = THCTensor_(size)(state, grid, 3); + + THCUNN_check_dim_size(state, grid, 5, 0, nbatch); + THCUNN_check_dim_size(state, grid, 5, 4, 3); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); + } +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + // resize output to the same shape as input + THCTensor_(resize5d)(state, output, N, C, D, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); + THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor 
devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricMaxPooling.cu new file mode 100644 index 0000000..c86be82 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricMaxPooling.cu @@ -0,0 +1,40 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, padT, padW, padH, + 1, 1, 1, ceilMode); + +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kT, kW, kH, dT, dW, dH, padT, padW, padH, + 1, 1, 1, ceilMode); + +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu new file mode 100644 index 0000000..0b5a17d --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu @@ -0,0 +1,271 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxUnpooling.cu" +#else + +static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { + int inputSlices = 0; + + THCUNN_check_shape_indices(state, indices, input); + + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + + if (THCTensor_(nDimension)(state, input) == 4) + { + inputSlices = THCTensor_(size)(state, input, 0); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + inputSlices = THCTensor_(size)(state, input, 1); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", + input->sizes()); + } + + int dimw = 3; + int dimh = 2; + int dimt = 1; + int dimn = 0; + if (input->dim() == 5) + { + dimt++; + dimw++; + dimh++; + dimn++; + } + + if (gradOutput != NULL) { + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", + oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]); + } + + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimn, inputSlices); + } +} + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + int batchSize = 0; + int inputSlices = 0; + int inputTime = 0; + int inputHeight = 0; + int inputWidth = 0; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, NULL, indices, + outputTime, outputWidth, outputHeight, + dT, dW, dH, padT, padW, padH); + THCUNN_assertSameGPU(state, 3, input, indices, output); + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (fiveDimensionalInput) + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + output = THCTensor_(newContiguous)(state, output); + THCTensor_(zero)(state, output); + + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + // newFoldBatchDim assumes contiguity so the newContiguous calls must + // preceed this + THCTensor *old_output = output; + output = THCTensor_(newFoldBatchDim)(state, output); + THCTensor_(free)(state, old_output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + + THCIndexTensor *old_indices = indices; + indices = THCIndexTensor_(newFoldBatchDim)(state, indices); + THCIndexTensor_(free)(state, old_indices); + } + + real* outputData = THCTensor_(data)(state, output); + + THCDeviceTensor cudaInput; + THCDeviceTensor cudaIndices; + + cudaInput = toDeviceTensor(state, input); + cudaIndices = toDeviceTensor(state, indices); + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateOutput<<>>( + cudaInput, cudaIndices, outputData, + outputTime, outputHeight, outputWidth, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); + THCIndexTensor_(free)(state, indices); +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + int batchSize = 0; + int inputSlices = 0; + int inputTime = 0; + int inputHeight = 0; + int inputWidth = 0; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, gradOutput, indices, + outputTime, outputWidth, outputHeight, + dT, dW, dH, padT, padW, padH); + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + // Collapse batch and feature dimensions + if (fiveDimensionalInput) { + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCIndexTensor *old_indices = indices; + indices = THCIndexTensor_(newFoldBatchDim)(state, indices); + THCIndexTensor_(free)(state, old_indices); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + real* gradOutputData = THCTensor_(data)(state, gradOutput); + + THCDeviceTensor cudaGradInput; + THCDeviceTensor cudaIndices; + + cudaGradInput = toDeviceTensor(state, gradInput); + cudaIndices = toDeviceTensor(state, indices); + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateGradInput<<>>( + gradOutputData, + outputTime, outputHeight, outputWidth, + cudaIndices, + cudaGradInput, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradInput); + THCIndexTensor_(free)(state, indices); + THCTensor_(free)(state, input); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu new file mode 100644 index 0000000..071b322 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -0,0 +1,174 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricReplicationPadding.cu" +#else + +static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + int numInputDims = THCTensor_(nDimension)(state, input); + + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + if (numInputDims == 5) { + planeDim++; + dimd++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int idepth = input->size[dimd]; + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int odepth = idepth + pfront + pback; + int oheight = iheight + ptop + pbottom; + int owidth = iwidth + pleft + pright; + THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2, + "input (D: %d H: %d, W: %d) is too small." + " Calculated output D: %d H: %d W: %d", + idepth, iheight, iwidth, odepth, oheight, owidth); + + if (gradOutput != NULL) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), + 3, "output gradient tensor must fit into 32-bit index math"); + + THArgCheck(numPlanes == THCTensor_(size)(state, gradOutput, planeDim), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + numPlanes, THCTensor_(size)(state, gradOutput, planeDim)); + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + THArgCheck(odepth == THCTensor_(size)(state, gradOutput, dimd), 3, + "gradOutput depth unexpected. 
Expected: %d, Got: %d", + odepth, THCTensor_(size)(state, gradOutput, dimd)); + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, ptop, + pbottom, pfront, pback); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + + if (numInputDims == 5) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimd++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputD = THCTensor_(size)(state, input, dimd); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputD = inputD + pfront + pback; + int outputH = inputH + ptop + pbottom; + int outputW = inputW + pleft + pright; + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 4) { + THCTensor_(resize4d)(state, output, numPlanes, outputD, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<5>(); + devOutput = toDeviceTensor(state, output).upcastOuter<5>(); + } else { + THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputD, outputH, + outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * + devOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + VolumetricReplicationPadding_updateOutput<<>>( + devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, gradOutput, pleft, pright, ptop, + pbottom, pfront, pback); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 5) { + planeDim++; + dimd++; + dimh++; + dimw++; + } + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 4) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<5>(); + devGradOutput = + toDeviceTensor(state, gradOutput).upcastOuter<5>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * + devGradOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); + + VolumetricReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu new file mode 100644 index 0000000..06994a1 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu @@ -0,0 +1,107 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 5, 2, input, + "5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); + } +} + + +void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputDepth = THCTensor_(size)(state, input, 2); + int inputHeight = THCTensor_(size)(state, input, 3); + int inputWidth = THCTensor_(size)(state, input, 4); + + THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && + outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + + THCTensor_(resize5d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputDepth, + outputHeight, + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_5d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + + +void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + 
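+  // Backward sketch: under nearest-neighbour upsampling each output voxel is
+  // a copy of exactly one input voxel, so the gradient is folded back by
+  // summing, for every input cell, the gradOutput entries that were copied
+  // from it (e.g. with a 2x scale, output indices 2d and 2d+1 both map back
+  // to input index d). gradOutput is made contiguous below so the device
+  // tensor view handed to the kernel is valid, and gradInput is
+  // zero-initialised before the accumulation.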
gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_5d_kernel_backward <<>>(num_kernels, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu new file mode 100644 index 0000000..1dbad86 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu @@ -0,0 +1,112 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); + } +} + +void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputDepth = THCTensor_(size)(state, input, 2); + int inputHeight = THCTensor_(size)(state, input, 3); + int inputWidth = THCTensor_(size)(state, input, 4); + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize5d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputDepth, outputHeight, outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = 
linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/im2col.h b/aten/src/THCUNN/im2col.h new file mode 100644 index 0000000..ba90560 --- /dev/null +++ b/aten/src/THCUNN/im2col.h @@ -0,0 +1,130 @@ +#ifndef THCUNN_IM2COL_H +#define THCUNN_IM2COL_H + +#include "common.h" +#include "THCNumerics.cuh" + +// Kernel for fast unfold+copy +// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) +template +__launch_bounds__(CUDA_NUM_THREADS) +__global__ void im2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, + const int64_t ksize_h, const int64_t ksize_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, + Dtype* data_col) { + CUDA_KERNEL_LOOP(index, n) { + int64_t w_out = index % width_col; + index /= width_col; + int64_t h_out = index % height_col; + int64_t channel_in = index / height_col; + int64_t channel_out = channel_in * ksize_h * ksize_w; + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; + data_col += (channel_out * height_col + h_out) * width_col + w_out; + data_im += (channel_in * height + h_in) * width + w_in; + for (int64_t i = 0; i < ksize_h; ++i) { + for (int64_t j = 0; j < ksize_w; ++j) { + int64_t h = h_in + i * dilation_h; + int64_t w = w_in + j * dilation_w; + *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert::to(0); + data_col += height_col * width_col; + } + } + } +} + +template +void im2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t height_col, const int64_t width_col, + const int64_t ksize_h, const int64_t ksize_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int64_t num_kernels = channels * height_col * width_col; + // Launch + im2col_kernel <<>> ( + num_kernels, data_im, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, + height_col, width_col, data_col + ); + THCudaCheck(cudaGetLastError()); +} + +template +__launch_bounds__(CUDA_NUM_THREADS) +__global__ void col2im_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t channels, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, + Dtype* data_im) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int64_t w_im = index % width + pad_w; + const int64_t h_im = (index / width) % height + pad_h; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int64_t w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int64_t w_col_end = min(w_im / stride_w + 1, width_col); + const int64_t h_col_start = + (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; + const int64_t h_col_end = min(h_im / stride_h + 1, height_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_h); + int64_t w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int64_t data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + data_im[index] = ScalarConvert::to(val); + } +} + +template +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im); + +template +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im) { + int64_t num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im_kernel <<>> ( + num_kernels, data_col, height, width, channels, + patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, + output_height, output_width, data_im + ); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/linear_upsampling.h b/aten/src/THCUNN/linear_upsampling.h new file mode 100644 index 0000000..bd8a601 --- /dev/null +++ b/aten/src/THCUNN/linear_upsampling.h @@ -0,0 +1,41 @@ +#ifndef THCUNN_LINEAR_UPSAMPLING_H +#define THCUNN_LINEAR_UPSAMPLING_H + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + + +template +__host__ __forceinline__ +static Acctype linear_upsampling_compute_scale( + int inputSize, int outputSize, bool align_corners) { + if (outputSize > 1) { + return align_corners ? (Acctype) (inputSize - 1) / (outputSize - 1) + : (Acctype) inputSize / outputSize; + } else { + return Acctype(0); + } +} + +template +__device__ __forceinline__ +static Acctype linear_upsampling_compute_source_index( + Acctype scale, int dst_index, bool align_corners) { + if (align_corners) { + return scale * dst_index; + } else { + Acctype src_idx = scale * (dst_index + Acctype(0.5)) - Acctype(0.5); + return src_idx < Acctype(0) ? 
Acctype(0) : src_idx; + } +} + +__device__ __forceinline__ +static int nearest_neighbor_compute_source_index( + const float scale, int dst_index, int inputSize) { + const int src_index = MIN(floor(dst_index * scale), inputSize - 1); + return src_index; +} +#endif + diff --git a/aten/src/THCUNN/row2col.h b/aten/src/THCUNN/row2col.h new file mode 100644 index 0000000..04765dd --- /dev/null +++ b/aten/src/THCUNN/row2col.h @@ -0,0 +1,90 @@ +#ifndef THCUNN_ROW2COL_H +#define THCUNN_ROW2COL_H + +#include "THCNumerics.cuh" +#include "common.h" + +// Kernel for fast unfold+copy on rows +template +__global__ void +row2col_kernel(const int n, const Dtype *data_row, const int width, + const int ksize_w, const int pad_w, const int stride_w, + const int dilation_w, const int width_col, Dtype *data_col) { + CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + index /= width_col; + int channel_in = index; + int channel_out = channel_in * ksize_w; + int w_in = w_out * stride_w - pad_w; + data_col += (channel_out)*width_col + w_out; + data_row += (channel_in)*width + w_in; + for (int j = 0; j < ksize_w; ++j) { + int w = w_in + j * dilation_w; + *data_col = (w >= 0 && w < width) ? data_row[j * dilation_w] + : ScalarConvert::to(0); + data_col += width_col; + } + } +} + +template +void row2col(cudaStream_t stream, const Dtype *data_row, const int channels, + const int width, const int ksize_w, const int pad_w, + const int stride_w, const int dilation_w, Dtype *data_col) { + // We are going to launch channels * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * width_col; + // Launch + row2col_kernel<<>>( + num_kernels, data_row, width, ksize_w, pad_w, stride_w, 1, width_col, + data_col); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void col2row_kernel(const int n, const Dtype *data_col, + const int width, const int channels, + const int kernel_w, const int pad_w, + const int stride_w, const int dilation_w, + const int width_col, Dtype *data_row) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int w_row = index % width + pad_w; + const int c_row = index / width; + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + // compute the start and end of the output + const int w_col_start = (w_row < kernel_extent_w) + ? 0 + : (w_row - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_row / stride_w + 1, width_col); + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int w_k = (w_row - w_col * stride_w); + if (w_k % dilation_w == 0) { + w_k /= dilation_w; + int data_col_index = (c_row * kernel_w + w_k) * width_col + w_col; + val += data_col[data_col_index]; + } + } + data_row[index] = ScalarConvert::to(val); + } + } + +template +void col2row(cudaStream_t stream, const Dtype *data_col, const int channels, + const int width, const int patch_w, const int pad_w, + const int stride_w, const int dilation_w, Dtype *data_row) { + int width_col = + (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
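// [Editor's note - illustrative comment, not part of the original source; the
// sizes below are chosen only for the example.] A tiny 1-D picture of the
// gather strategy used here: assume width = 4, ksize_w = 2, stride_w = 1,
// pad_w = 0, dilation_w = 1, so width_col = 3. Column 0 covers row elements
// {x0, x1}, column 1 covers {x1, x2}, and column 2 covers {x2, x3}. Row
// element x1 therefore receives contributions from (w_col = 0, w_k = 1) and
// (w_col = 1, w_k = 0). col2row_kernel assigns one work-item to x1 and lets it
// loop over exactly those two column entries, instead of having two
// column-side threads atomicAdd into the same output address.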
+ col2row_kernel< + Dtype, Acctype><<>>( + num_kernels, data_col, width, channels, patch_w, pad_w, stride_w, + dilation_w, width_col, data_row); + + THCudaCheck(cudaGetLastError()); +} +#endif diff --git a/aten/src/THCUNN/vol2col.h b/aten/src/THCUNN/vol2col.h new file mode 100644 index 0000000..223248f --- /dev/null +++ b/aten/src/THCUNN/vol2col.h @@ -0,0 +1,139 @@ +#ifndef THCUNN_VOL2COL_H +#define THCUNN_VOL2COL_H + +#include "common.h" +#include "THCNumerics.cuh" + +// Kernel for fast unfold+copy on volumes +template +__global__ void vol2col_kernel(const int n, const Dtype* data_vol, + const int depth, const int height, const int width, + const int ksize_t, const int ksize_h, const int ksize_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + const int depth_col, const int height_col, const int width_col, + Dtype* data_col) { +CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + index /= height_col; + int t_out = index % depth_col; + int channel_in = index / depth_col; + int channel_out = channel_in * ksize_t * ksize_h * ksize_w; + int t_in = t_out * stride_t - pad_t; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + data_col += ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + int t = t_in + i * dilation_t; + int h = h_in + j * dilation_h; + int w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ? + data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : ScalarConvert::to(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col(cudaStream_t stream, const Dtype* data_vol, const int channels, + const int depth, const int height, const int width, + const int depth_col, const int height_col, const int width_col, + const int ksize_t, const int ksize_h, const int ksize_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + Dtype* data_col) { + // We are going to launch channels * depth_col * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. 
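// [Editor's note - clarifying comment, not part of the original source.]
// vol2col takes depth_col/height_col/width_col from the caller; they are
// expected to follow the usual convolution output-shape formula (the same one
// used inline for width_col in row2col.h above), e.g.:
//   depth_col  = (depth  + 2 * pad_t - (dilation_t * (ksize_t - 1) + 1)) / stride_t + 1;
//   height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
//   width_col  = (width  + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;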
+ int num_kernels = channels * depth_col * height_col * width_col; + // Launch + vol2col_kernel <<>> ( + num_kernels, data_vol, depth, height, width, ksize_t, ksize_h, ksize_w, + pad_t, pad_h, pad_w, stride_t, stride_h, stride_w, + dilation_t, dilation_h, dilation_w, + depth_col, height_col, width_col, data_col + ); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void vol2im_kernel(const int n, const Dtype* data_col, + const int depth, const int height, const int width, const int channels, + const int kernel_t, const int kernel_h, const int kernel_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + const int depth_col, const int height_col, const int width_col, + Dtype* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int w_im = index % width + pad_w; + const int h_im = (index / width) % height + pad_h; + const int t_im = (index / width / height) % depth + pad_t; + const int c_im = index / (width * height * depth); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + int kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + const int t_col_start = + (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1; + const int t_col_end = min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int t_k = (t_im - t_col * stride_t); + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = + (((((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k) + * depth_col + t_col) * height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = ScalarConvert::to(val); + } +} + +template +void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels, + const int depth, const int height, const int width, + const int output_depth, const int output_height, const int output_width, + const int patch_t, const int patch_h, const int patch_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + Dtype* data_vol) { + int num_kernels = channels * depth * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
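// [Editor's note - illustrative comment, not part of the original source.]
// This is the same gather-style reduction as col2im/col2row: num_kernels is
// channels * depth * height * width, i.e. one work-item per *input* voxel,
// and each work-item walks the (t_col, h_col, w_col) column windows that
// overlap it and sums their entries. A scatter-style kernel over column
// entries would instead need atomicAdd on data_vol, making the summation
// order (and hence the floating-point result) non-deterministic.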
+ vol2im_kernel <<>> ( + num_kernels, data_col, depth, height, width, channels, + patch_t, patch_h, patch_w, pad_t, pad_h, pad_w, stride_t, stride_h, stride_w, + dilation_t, dilation_h, dilation_w, + output_depth, output_height, output_width, data_vol + ); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THNN/CMakeLists.txt b/aten/src/THNN/CMakeLists.txt new file mode 100644 index 0000000..e61624c --- /dev/null +++ b/aten/src/THNN/CMakeLists.txt @@ -0,0 +1,5 @@ +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp +PARENT_SCOPE) +INSTALL(FILES THNN.h Reduction.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN") +INSTALL(FILES generic/THNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN/generic") diff --git a/aten/src/THNN/README.md b/aten/src/THNN/README.md new file mode 100644 index 0000000..da4d549 --- /dev/null +++ b/aten/src/THNN/README.md @@ -0,0 +1,27 @@ +# THNN + +THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions; most users will want to use ATen, which provides a C++ wrapper around these functions. + +There is also a CUDA counterpart of THNN, THCUNN. + +Looking to add an implementation? Consider writing an ATen native function +instead! See [../ATen/native](ATen/native). + +## Links + +* [API reference](doc/api_reference.md) +* [Style guidelines](doc/style_guidelines.md) + +## API + +THNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations: + +* **updateOutput** - applies the module to an input +* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input +* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters + +For information on argument types please check the [API reference](doc/api_reference.md). + +## Developer docs + +* [Style guidelines](doc/style_guidelines.md) diff --git a/aten/src/THNN/Reduction.h b/aten/src/THNN/Reduction.h new file mode 100644 index 0000000..fea4c2f --- /dev/null +++ b/aten/src/THNN/Reduction.h @@ -0,0 +1,17 @@ +#ifndef REDUCE_H +#define REDUCE_H + +namespace Reduction { + +// NB: Keep this in sync with Reduction class in torch/nn/modules/functional.py +// These constants control the reduction behavior of loss functions. 
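// [Editor's illustration, derived from the per-value comments below.] Given
// per-element losses l_1..l_N, Reduction::None returns them unreduced,
// Reduction::ElementwiseMean returns (1/N) * (l_1 + ... + l_N), and
// Reduction::Sum returns l_1 + ... + l_N. (Criteria with per-element weights,
// e.g. ClassNLLCriterion.c below, divide by the total weight instead of N.)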
+// Ideally, this would be a scoped enum, but jit doesn't support that +enum Reduction { + None, // Do not reduce + ElementwiseMean, // Sum losses and take mean over each individually computed loss element + Sum, // Sum losses + END +}; +} + +#endif diff --git a/aten/src/THNN/THNN.h b/aten/src/THNN/THNN.h new file mode 100644 index 0000000..e216e62 --- /dev/null +++ b/aten/src/THNN/THNN.h @@ -0,0 +1,33 @@ +#ifndef THNN_H +#define THNN_H + +#include +#include +#ifdef _OPENMP +#include +#endif + +#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME) + +#define THIndexTensor THLongTensor +#define THIndexTensor_(NAME) THLongTensor_ ## NAME + +#define THIntegerTensor THIntTensor +#define THIntegerTensor_(NAME) THIntTensor_ ## NAME + +typedef int64_t THIndex_t; +typedef int32_t THInteger_t; +typedef void THNNState; + +#define THNN_resizeAs_indices(I1, I2) \ + THLongStorage *size2 = THIndexTensor_(newSizeOf)(I2); \ + if (!THTensor_(isSize)(I1, size2)) \ + { \ + THTensor_(resize)(I1, size2, NULL); \ + } \ + THLongStorage_free(size2); + +#include "generic/THNN.h" +#include + +#endif diff --git a/aten/src/THNN/doc/api_reference.md b/aten/src/THNN/doc/api_reference.md new file mode 100644 index 0000000..2372bba --- /dev/null +++ b/aten/src/THNN/doc/api_reference.md @@ -0,0 +1,27 @@ +# API docs + +This document describes the conventions behind the THNN API. + +### The API + +All functions provided by THNN are stored in `aten/src/THNN/generic/THNN.h`. +Look at this file. + +### Note on function names + +Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions: + +* `void THNN_FloatAbs_updateOutput(...)` +* `void THNN_DoubleAbs_updateOutput(...)` + +In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. + +### Argument types + +Some arguments have additional tags placed in square brackets in their header declarations: + +* **[OUT]** - This is the output argument. It will be reshaped if needed. +* **[OPTIONAL]** - This argument is optional and can be safely set to NULL +* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call. +* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output). + diff --git a/aten/src/THNN/doc/style_guidelines.md b/aten/src/THNN/doc/style_guidelines.md new file mode 100644 index 0000000..a725454 --- /dev/null +++ b/aten/src/THNN/doc/style_guidelines.md @@ -0,0 +1,59 @@ +## API design guidelines + +Functions should return `void`. + +All functions should accept arguments in the following order. `...` represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this: +``` +[weight], [bias], [any buffers], [additional arguments], [optional arguments] +``` + +### Modules +``` +updateOutput: state, input, output, ... +updateGradInput: state, input, gradOutput, gradInput, ... +accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... +``` + +e.g. +```C +void THNN_(HardShrink_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) +``` + +### Criterions +``` +updateOutput: state, input, target, output, ... 
+updateGradInput: state, input, target, gradInput, ... +``` + +e.g. + +```C +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState* state, + THTensor *input, + THLongTensor *target, + THTensor *output, + THTensor *weights, + THTensor *total_weight, + bool sizeAverage) +``` + +## Code style guide + +```C +void THNN_Linear_updateOutput( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +//<- 10 -> +``` + +All arguments should start on a new line after function name, and they should be indented using 10 spaces. + +Use 2 spaces for block indentation. diff --git a/aten/src/THNN/generic/Abs.c b/aten/src/THNN/generic/Abs.c new file mode 100644 index 0000000..28721ec --- /dev/null +++ b/aten/src/THNN/generic/Abs.c @@ -0,0 +1,28 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Abs.c" +#else + +void THNN_(Abs_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + THTensor_(abs)(output, input); +} + +void THNN_(Abs_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + real z = *input_data; + *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1); + ); +} + +#endif diff --git a/aten/src/THNN/generic/AbsCriterion.c b/aten/src/THNN/generic/AbsCriterion.c new file mode 100644 index 0000000..73552a2 --- /dev/null +++ b/aten/src/THNN/generic/AbsCriterion.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/AbsCriterion.c" +#else + +void THNN_(AbsCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = fabs(*input_data - *target_data); + ); + return; + } + + real sum = 0; + THTensor_(resize1d)(output, 1); + TH_TENSOR_APPLY2(real, input, real, target, + sum += fabs(*input_data - *target_data); + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = ((*input_data - *target_data) >= 0 ? 1 : -1); + ); + TH_TENSOR_APPLY2(real, gradInput, real, gradOutput, + *gradInput_data *= *gradOutput_data; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.) * THTensor_(fastGet1d)(gradOutput, 0); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data - *target_data) >= 0 ? 
norm : -norm; + ); +} + +#endif diff --git a/aten/src/THNN/generic/BCECriterion.c b/aten/src/THNN/generic/BCECriterion.c new file mode 100644 index 0000000..f3f74ca --- /dev/null +++ b/aten/src/THNN/generic/BCECriterion.c @@ -0,0 +1,118 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BCECriterion.c" +#else + +#define EPS 1e-12 + +static inline real safe_log(real a) { + if (a == 0.) { + return log(EPS); + } + return log(a); +} + +void THNN_(BCECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + real x = *input_data; + real y = *target_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + *output_data = -(safe_log(x) * y + safe_log(1. - x) * (1. - y)); + ); + if (weights) { + THTensor_(cmul)(output, output, weights); + } + return; + } + + THTensor_(resize1d)(output, 1); + real sum = 0; + + if (weights) { + TH_TENSOR_APPLY3(real, input, real, target, real, weights, + real x = *input_data; + real y = *target_data; + real w = *weights_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= (safe_log(x) * y + safe_log(1. - x) * (1. - y)) * w; + ); + } else { + TH_TENSOR_APPLY2(real, input, real, target, + real x = *input_data; + real y = *target_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= safe_log(x) * y + safe_log(1. - x) * (1. - y); + ); + } + + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(BCECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_NELEMENT(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data; + real y = *target_data; + *gradInput_data = -(y - x) / ((1. - x + EPS) * (x + EPS)); + ); + + if (weights) { + TH_TENSOR_APPLY3(real, gradInput, real, weights, real, gradOutput, + *gradInput_data = *gradInput_data * *weights_data * *gradOutput_data; + ); + } else { + THTensor_(cmul)(gradInput, gradInput, gradOutput); + } + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data; + real y = *target_data; + *gradInput_data = - norm * (y - x) / ((1. 
- x + EPS) * (x + EPS)) * THTensor_(fastGet1d)(gradOutput, 0); + ); + + if(weights) + THTensor_(cmul)(gradInput, gradInput, weights); +} + +#undef EPS + +#endif diff --git a/aten/src/THNN/generic/BatchNormalization.c b/aten/src/THNN/generic/BatchNormalization.c new file mode 100644 index 0000000..1f2aa3c --- /dev/null +++ b/aten/src/THNN/generic/BatchNormalization.c @@ -0,0 +1,160 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BatchNormalization.c" +#else + +void THNN_(BatchNormalization_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double momentum, double eps) +{ + THTensor_(resizeAs)(output, input); + int64_t nInput = THTensor_(size)(input, 1); + int64_t f; + ptrdiff_t n = THTensor_(nElement)(input) / nInput; + + if (train) { + THTensor_(resize1d)(save_mean, nInput); + THTensor_(resize1d)(save_std, nInput); + } + + #pragma omp parallel for + for (f = 0; f < nInput; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *out = THTensor_(newSelect)(output, 1, f); + + real mean, invstd; + + if (train) { + // compute mean per input + accreal sum = 0; + TH_TENSOR_APPLY(real, in, sum += *in_data;); + + mean = (real) sum / n; + THTensor_(set1d)(save_mean, f, (real) mean); + + // compute variance per input + sum = 0; + TH_TENSOR_APPLY(real, in, + sum += (*in_data - mean) * (*in_data - mean);); + + if (sum == 0 && eps == 0.0) { + invstd = 0; + } else { + invstd = (real) (1 / sqrt(sum/n + eps)); + } + THTensor_(set1d)(save_std, f, (real) invstd); + + // update running averages + if (running_mean) { + THTensor_(set1d)(running_mean, f, + (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f))); + } + if (running_var) { + accreal unbiased_var = sum / (n - 1); + THTensor_(set1d)(running_var, f, + (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f))); + } + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // compute output + real w = weight ? THTensor_(get1d)(weight, f) : 1; + real b = bias ? THTensor_(get1d)(bias, f) : 0; + + TH_TENSOR_APPLY2(real, in, real, out, + *out_data = (real) (((*in_data - mean) * invstd) * w + b);); + + THTensor_(free)(out); + THTensor_(free)(in); + } +} + +void THNN_(BatchNormalization_backward)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, + THTensor *gradWeight, THTensor *gradBias, THTensor *weight, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double scale, double eps) +{ + THNN_CHECK_SHAPE(input, gradOutput); + int64_t nInput = THTensor_(size)(input, 1); + int64_t f; + ptrdiff_t n = THTensor_(nElement)(input) / nInput; + + if (gradInput) { + THTensor_(resizeAs)(gradInput, input); + } + + #pragma omp parallel for + for (f = 0; f < nInput; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); + real w = weight ? 
THTensor_(get1d)(weight, f) : 1; + real mean, invstd; + if (train) { + mean = THTensor_(get1d)(save_mean, f); + invstd = THTensor_(get1d)(save_std, f); + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // sum over all gradOutput in feature plane + accreal sum = 0; + TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;); + + // dot product of the Q(X) and gradOuput + accreal dotp = 0; + TH_TENSOR_APPLY2(real, in, real, gradOut, + dotp += (*in_data - mean) * (*gradOut_data);); + + if (gradInput) { + THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f); + + if (train) { + // when in training mode + // Q(X) = X - E[x] ; i.e. input centered to zero mean + // Y = Q(X) / σ ; i.e. BN output before weight and bias + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w + + // projection of gradOutput on to output scaled by std + real k = (real) dotp * invstd * invstd / n; + TH_TENSOR_APPLY2(real, gradIn, real, in, + *gradIn_data = (*in_data - mean) * k;); + + accreal gradMean = sum / n; + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); + + } else { + // when in evaluation mode + // Q(X) = X - running_mean ; i.e. input centered to zero mean + // Y = Q(X) / running_std ; i.e. BN output before weight and bias + // dL/dX = w / running_std + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = *gradOut_data * invstd * w;); + } + + THTensor_(free)(gradIn); + } + + if (gradWeight) { + real val = THTensor_(get1d)(gradWeight, f); + THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd); + } + + if (gradBias) { + real val = THTensor_(get1d)(gradBias, f); + THTensor_(set1d)(gradBias, f, val + scale * sum); + } + + THTensor_(free)(gradOut); + THTensor_(free)(in); + } +} + +#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c new file mode 100644 index 0000000..a434efa --- /dev/null +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -0,0 +1,219 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + THTensor_(resize1d)(total_weight, 1); + int n_dims = THTensor_(_nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (THIndexTensor_(_nDimension)(target) > 1) { + THError("multi-target not supported"); + } + if (THTensor_(_nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + if (weights && THTensor_(nElement)(weights) != n_classes) { + THDescBuff s1 = THTensor_(sizeDesc)(weights); + THError("weight tensor should be defined either for all %d classes or no classes" + " but got weight tensor of shape: %s", n_classes, s1.str); + } + + if (reduction == Reduction::None && n_dims == 2) { + int batch_size = THTensor_(size)(input, 0); + THTensor_(resize1d)(output, batch_size); + + std::atomic invalid_target(-1); // We cannot throw an exception inside omp parallel + int i; + #pragma omp parallel for private(i) + for (i = 0; i < batch_size; i++) { + int cur_target = THLongTensor_fastGet1d(target, i) - TH_INDEX_BASE; + + if (cur_target >= 0 && cur_target < n_classes) { + if (cur_target == ignore_index) { + THTensor_(fastSet1d)(output, i, 0.0f); + continue; + } + real 
cur_weight = weights ? THTensor_(fastGet1d)(weights, cur_target) : 1.0f; + THTensor_(fastSet1d)(output, i, -THTensor_(fastGet2d)(input, i, cur_target) * cur_weight); + } else { + int tmp = -1; + invalid_target.compare_exchange_strong(tmp, cur_target); + } + } + + if (invalid_target.load() >= 0) { + THError("Target %d out of bounds", invalid_target.load()); + } + + return; + } + + THTensor_(resize1d)(output, 1); + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + output_data[0] = total_weight_data[0] = 0.0; + + if (THTensor_(_nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; + output_data[0] = -input_data[cur_target] * total_weight_data[0]; + } + } else if (THTensor_(_nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++) { + int cur_target = target_data[i] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? weights_data[cur_target] : 1.0f; + total_weight_data[0] += cur_weight; + output_data[0] -= input_data[i * n_target + cur_target] * cur_weight; + } + } + } + + if (reduction == Reduction::ElementwiseMean && total_weight_data[0]) { + output_data[0] /= total_weight_data[0]; + } + + if (weights) { + THTensor_(free)(weights); + } + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + int n_dims = THTensor_(_nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (!THTensor_(isContiguous)(gradInput)) { + THError("gradInput must be contiguous"); + } + + if (THIndexTensor_(_nDimension)(target) > 1) { + THError("multi-target not supported"); + } + + if (THTensor_(_nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + + if (weights && THTensor_(nElement)(weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + if (reduction == Reduction::None && n_dims == 2) { + int batch_size = THTensor_(size)(input, 0); + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, batch_size); + + int i; + #pragma omp parallel for private(i) + for (i = 0; i < batch_size; i++) { + int cur_target = THLongTensor_fastGet1d(target, i) - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + real weight = weights ? 
THTensor_(fastGet1d)(weights, cur_target) : 1.0f; + THTensor_(fastSet2d)(gradInput, i, cur_target, -weight * THTensor_(fastGet1d)(gradOutput, i)); + } + return; + } + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) { + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + real gradOutput_value = THTensor_(get1d)(gradOutput, 0); + + if (THTensor_(_nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[cur_target] = + (reduction != Reduction::ElementwiseMean && weights) ? -weights_data[cur_target] : -1; + gradInput_data[cur_target] *= gradOutput_value; + } + + } else if (THTensor_(_nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++){ + int cur_target = target_data[i] - TH_INDEX_BASE; + + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[i * n_target + cur_target] = + -(weights ? weights_data[cur_target] : 1.0f) * gradOutput_value; + + if (reduction == Reduction::ElementwiseMean && *total_weight_data) { + gradInput_data[i * n_target + cur_target] /= *total_weight_data; + } + } + } + } + + THIndexTensor_(free)(target); + if (weights) { + THTensor_(free)(weights); + } +} + +#endif diff --git a/aten/src/THNN/generic/Col2Im.c b/aten/src/THNN/generic/Col2Im.c new file mode 100644 index 0000000..aa5174d --- /dev/null +++ b/aten/src/THNN/generic/Col2Im.c @@ -0,0 +1,232 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Col2Im.c" +#else + +// Note [im2col/col2im output padding] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Our implementations of im2col and col2im take both the input height/width as +// well as a seemingly redundant output height/width. In principle, you could +// compute the output height/width by using the convolution shape formulas. So, +// what's up with that? +// +// The trouble arises when one runs the backward of a transposed convolution +// with output_padding >= stride. (BTW, output_padding is known as adj inside +// THNN.) Let's consider a simple case where we have kernel=2, dilation=2, +// stride=1, output_padding=1 for a 4x4 input: +// +// Input: X +// +// Output: X.X. +// .... +// X.X. +// .... +// +// If we compute backwards of output with a standard convolution on the output +// with the same parameters, we would end up with a 2x2 grad_input (because you +// can slide the stencil over to the right once and down once). But that is all +// out-of-bounds if you're computing backwards for a 1x1 input. +// +// "Now Edward," you might say, "the real problem is that you set output_padding +// >= stride, surely an error should have been raised in this case." To +// understand why it is useful to handle this case, we have to understand how we +// compute the weight gradient of a convolution. Suppose we have a convolution +// with kernel=2, stride=2 on a 5x5 input. Let us see all the contributions of +// weight[0][0] (which we have labeled w) in the output: +// +// Input: a.b.. 
Weight: w. +// ..... .. +// c.d.. +// ..... +// ..... +// +// Output: [ aw+... bw+... ] +// [ cw+... dw+... ] +// +// From this diagram, it easy to see that we can compute the weight gradient +// by performing a *dilated* convolution between the input and the +// output gradients with kernel=2, dilation=2, stride=1. But there's a rub: if +// we do a dilated convolution directly, we'll end up with a 3x3 weight +// gradient, when we clearly wanted a 2x2. So how do we avoid going out +// of bounds? We could add a notion of 'output_padding' for non-transposed +// convolution, but another simple and effective fix is to just accept +// the desired output size directly, and compute only within those bounds. +// +// +// ALSO do vol2col + +static void THNN_(im2col)(const real* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + real* data_col) { + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
+ data_im[(c_im * height + h_im) * width + w_im] : 0; + } + } + } +} + +static void THNN_(col2im)(const real* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + real* data_im) { + memset(data_im, 0, sizeof(real) * height * width * channels); + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } +} + +static inline void THNN_(Col2Im_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 6, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(sW > 0 && sH > 0, 12, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + THArgCheck(dW > 0 && dH > 0, 8, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int64_t ndim = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input, + "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); + + int64_t batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size[batch_dim + 1]; + + if (nInputPlane % (kW * kH) != 0) { + THError("Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=%lld and " + "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); + } + + int64_t inputLength = input->size[batch_dim + 2]; + int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; + int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; + + if (inputLength != (nBlocksH * nBlocksW)) { + THError("Given output_size=(%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected " + "size of input's dimension 2 to match the calculated number of " + "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.", + outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW, + (long long) nBlocksH, (long long) nBlocksW, + (long long) (nBlocksH * nBlocksW), (long long) inputLength); + } + + if (outputWidth < 1 || outputHeight < 1) { + THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).", + outputHeight, outputWidth); + } +} + +void THNN_(Col2Im_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, sH, sW); + + bool batched_input = true; + if (input->dim() == 2) { + // Force batch + batched_input = false; + THTensor_(resize3d)(input, 1, input->size[0], input->size[1]); + } + + long batchSize = input->size[0]; + long nInputPlane = input->size[1]; + long nOutputPlane = nInputPlane / (kW * kH); + + input = THTensor_(newContiguous)(input); + + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + THTensor_(zero)(output); + + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + for (int64_t elt = 0; elt < batchSize; elt++) { + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + THNN_(col2im)( + THTensor_(data)(input_n), + nOutputPlane, + outputHeight, outputWidth, + height_col, width_col, + kH, kW, + padH, padW, + sH, sW, + dH, dW, THTensor_(data)(output_n)); + } + + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + if (!batched_input) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + } + THTensor_(free)(input); +} + +void THNN_(Col2Im_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, + kH, kW, dH, dW, padH, padW, sH, sW); +} + +#endif diff --git a/aten/src/THNN/generic/DistKLDivCriterion.c b/aten/src/THNN/generic/DistKLDivCriterion.c new file mode 100644 index 0000000..8233608 --- /dev/null +++ b/aten/src/THNN/generic/DistKLDivCriterion.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + 
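  // [Editor's note, inferred from the element-wise formula below, not part of
  // the original source.] The loss term computed here is
  //   target_i * (log(target_i) - input_i)   for target_i > 0, and 0 otherwise,
  // i.e. `input` is expected to already hold log-probabilities while `target`
  // holds plain probabilities.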
THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0; + ); + return; + } + + THTensor_(resize1d)(output, 1); + + real sum = 0; + + TH_TENSOR_APPLY2(real, input, real, target, + sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0; + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(input, gradOutput); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, target, + *gradInput_data = *target_data > 0 ? (-*target_data) * *gradOutput_data : 0; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = *target_data > 0 ? norm * (-*target_data) * THTensor_(fastGet1d)(gradOutput, 0) : 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c new file mode 100644 index 0000000..f2d8718 --- /dev/null +++ b/aten/src/THNN/generic/ELU.c @@ -0,0 +1,45 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ELU.c" +#else + +void THNN_(ELU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal alpha_, + accreal scale, + bool inplace) +{ + real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + if (inplace) { + TH_TENSOR_APPLY(real, input, + *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + ); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + ); + } +} + +void THNN_(ELU_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal alpha_, + accreal scale) +{ + real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + THNN_CHECK_NELEMENT(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = *output_data <= 0 ? 
*gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; + ); +} + +#endif diff --git a/aten/src/THNN/generic/FeatureLPPooling.c b/aten/src/THNN/generic/FeatureLPPooling.c new file mode 100644 index 0000000..fdb4bbe --- /dev/null +++ b/aten/src/THNN/generic/FeatureLPPooling.c @@ -0,0 +1,360 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/FeatureLPPooling.c" +#else + +#ifndef FEATURE_LP_DEFS +#define FEATURE_LP_DEFS + +#ifdef _MSC_VER + #define FEATURE_LP_SIZE_TYPE int64_t + #define FEATURE_LP_CAST_TYPE (int64_t) +#else + #define FEATURE_LP_SIZE_TYPE size_t + #define FEATURE_LP_CAST_TYPE +#endif + +typedef struct { + size_t size[4]; + size_t stride[4]; +} FeatureLPPoolingSizes; + +static inline size_t flpGetOffset(FeatureLPPoolingSizes* s, + FEATURE_LP_SIZE_TYPE batch, + FEATURE_LP_SIZE_TYPE feature, + FEATURE_LP_SIZE_TYPE opt1, + FEATURE_LP_SIZE_TYPE opt2) { + return s->stride[0] * batch + + s->stride[1] * feature + + s->stride[2] * opt1 + + s->stride[3] * opt2; +} + +static inline size_t flpOutputSize(FEATURE_LP_SIZE_TYPE inputSize, + FEATURE_LP_SIZE_TYPE width, + FEATURE_LP_SIZE_TYPE stride) { + return ((inputSize - width) / stride) + 1; +} + +#endif // FEATURE_LP_DEFS + +FeatureLPPoolingSizes +THNN_(FeatureLPPooling_upcastCPU)(THTensor* t, bool batchMode) { + int dim = THTensor_(_nDimension)(t); + + // Upcast to [batch dim][feature dim][opt dim 1][opt dim 2] + FeatureLPPoolingSizes s; + for (int i = 0; i < 4; ++i) { + s.size[i] = 1; + s.stride[i] = 1; + } + + if (dim == 1) { + THAssert(!batchMode); + // [feature dim] + s.size[1] = THTensor_(size)(t, 0); + s.stride[1] = THTensor_(stride)(t, 0); + } else if (dim == 2) { + if (batchMode) { + // [batch dim][feature dim] + for (int i = 0; i < 2; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } else { + // [feature dim][opt dim 1] + s.size[1] = THTensor_(size)(t, 0); + s.stride[1] = THTensor_(stride)(t, 0); + s.size[2] = THTensor_(size)(t, 1); + s.stride[2] = THTensor_(stride)(t, 1); + } + } else if (dim == 3) { + if (batchMode) { + // [batch dim][feature dim][opt dim 1] + for (int i = 0; i < 3; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } else { + // [feature dim][opt dim 1][opt dim 2] + for (int i = 1; i < 4; ++i) { + s.size[i] = THTensor_(size)(t, i - 1); + s.stride[i] = THTensor_(stride)(t, i - 1); + } + } + } else if (dim == 4) { + // [batch dim][feature dim][opt dim 1][opt dim 2] + THAssert(batchMode); + for (int i = 0; i < 4; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } + + return s; +} + +void +THNN_(FeatureLPPooling_resizeForOutputCPU)(THTensor* toResize, + THTensor* input, + bool batchMode, + int width, + int stride) { + int inputDim = THTensor_(_nDimension)(input); + THAssert(inputDim >= 1 && inputDim <= 4); + + int64_t outSize = + flpOutputSize(THTensor_(size)(input, 0), width, stride); + if (batchMode) { + THAssert(inputDim > 1); + outSize = + flpOutputSize(THTensor_(size)(input, 1), width, stride); + } else { + THAssert(inputDim < 4); + } + + if (inputDim == 1) { + THTensor_(resize1d)(toResize, outSize); + } else if (inputDim == 2) { + if (batchMode) { + THTensor_(resize2d)(toResize, + THTensor_(size)(input, 0), + outSize); + } else { + THTensor_(resize2d)(toResize, + outSize, + THTensor_(size)(input, 1)); + } + } else if (inputDim == 3) { + if (batchMode) { + THTensor_(resize3d)(toResize, + THTensor_(size)(input, 0), outSize, + THTensor_(size)(input, 2)); + } 
else { + THTensor_(resize3d)(toResize, + outSize, THTensor_(size)(input, 1), + THTensor_(size)(input, 2)); + } + } else if (inputDim == 4) { + THTensor_(resize4d)(toResize, + THTensor_(size)(input, 0), + outSize, + THTensor_(size)(input, 2), + THTensor_(size)(input, 3)); + } +} + +// Makes `toResize` the same size/dimensionality as `src` +void +THNN_(FeatureLPPooling_resizeCPU)(THTensor* toResize, + THTensor* src) { + int inputDim = THTensor_(_nDimension)(src); + THAssert(inputDim >= 1 && inputDim <= 4); + + if (inputDim == 1) { + THTensor_(resize1d)(toResize, + THTensor_(size)(src, 0)); + } else if (inputDim == 2) { + THTensor_(resize2d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1)); + } else if (inputDim == 3) { + THTensor_(resize3d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1), + THTensor_(size)(src, 2)); + } else if (inputDim == 4) { + THTensor_(resize4d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1), + THTensor_(size)(src, 2), + THTensor_(size)(src, 3)); + } +} + +void +THNN_(FeatureLPPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal power, + int width, + int stride, + bool batchMode) { + int inputDim = THTensor_(_nDimension)(input); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + FeatureLPPoolingSizes inputDesc = + THNN_(FeatureLPPooling_upcastCPU)(input, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(inputDesc.size[1] >= (FEATURE_LP_SIZE_TYPE) width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 5, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 6, + "stride must be between 1 - 4"); + + // Resize output + + THNN_(FeatureLPPooling_resizeForOutputCPU)( + output, input, batchMode, width, stride); + + FeatureLPPoolingSizes outputDesc = + THNN_(FeatureLPPooling_upcastCPU)(output, batchMode); + + real* inputP = THTensor_(data)(input); + real* outputP = THTensor_(data)(output); + + FEATURE_LP_SIZE_TYPE batch, opt1, opt2, outputFeature, i; + +#pragma omp parallel for + for (batch = 0; batch < FEATURE_LP_CAST_TYPE inputDesc.size[0]; ++batch) { + for (opt1 = 0; opt1 < FEATURE_LP_CAST_TYPE inputDesc.size[2]; ++opt1) { + for (opt2 = 0; opt2 < FEATURE_LP_CAST_TYPE inputDesc.size[3]; ++opt2) { + for (outputFeature = 0; + outputFeature < FEATURE_LP_CAST_TYPE outputDesc.size[1]; ++outputFeature) { + + accreal v = (accreal) 0; + for (i = 0; i < (FEATURE_LP_SIZE_TYPE) width; ++i) { + FEATURE_LP_SIZE_TYPE inputFeature = outputFeature * stride + i; + if (inputFeature >= FEATURE_LP_CAST_TYPE inputDesc.size[1]) { + break; + } + + v += + pow(inputP[flpGetOffset(&inputDesc, + batch, + inputFeature, + opt1, + opt2)], power); + } + + outputP[flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)] = + pow(v, (accreal) 1 / power); + } + } + } + } +} + +void +THNN_(FeatureLPPooling_updateGradInput)( + THNNState *state, + THTensor* gradOutput, + THTensor* input, + THTensor* output, + THTensor* gradInput, + accreal power, + int width, + int stride, + bool batchMode) { + int inputDim = THTensor_(_nDimension)(input); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 3, + "input must be 2-4 dimensions for batch mode"); + } else { + 
THArgCheck(inputDim >= 1 && inputDim <= 3, 3, + "input must be 1-3 dimensions for non-batch mode"); + } + + FeatureLPPoolingSizes inputDesc = + THNN_(FeatureLPPooling_upcastCPU)(input, batchMode); + FeatureLPPoolingSizes gradOutputDesc = + THNN_(FeatureLPPooling_upcastCPU)(gradOutput, batchMode); + FeatureLPPoolingSizes outputDesc = + THNN_(FeatureLPPooling_upcastCPU)(output, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(inputDesc.size[1] >= (FEATURE_LP_SIZE_TYPE) width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 7, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 8, + "stride must be between 1 - 4"); + + for (int i = 0; i < 4; ++i) { + THAssertMsg(outputDesc.size[i] == gradOutputDesc.size[i], + "output and gradOutput sizes do not match"); + } + + // Make sure that the input sizes produce the output sizes + THArgCheck(flpOutputSize(FEATURE_LP_CAST_TYPE inputDesc.size[1], width, stride) == + outputDesc.size[1], 3, + "input and output sizes do not match with respect to " + "width and stride"); + + // Resize `gradInput` based on `input` + THNN_(FeatureLPPooling_resizeCPU)(gradInput, input); + + // Zero gradInput for accumulation + THTensor_(zero)(gradInput); + + FeatureLPPoolingSizes gradInputDesc = + THNN_(FeatureLPPooling_upcastCPU)(gradInput, batchMode); + + real* gradOutputP = THTensor_(data)(gradOutput); + real* gradInputP = THTensor_(data)(gradInput); + real* outputP = THTensor_(data)(output); + real* inputP = THTensor_(data)(input); + + FEATURE_LP_SIZE_TYPE batch, opt1, opt2, outputFeature, i; + +#pragma omp parallel for + for (batch = 0; batch < FEATURE_LP_CAST_TYPE inputDesc.size[0]; ++batch) { + for (opt1 = 0; opt1 < FEATURE_LP_CAST_TYPE inputDesc.size[2]; ++opt1) { + for (opt2 = 0; opt2 < FEATURE_LP_CAST_TYPE inputDesc.size[3]; ++opt2) { + for (outputFeature = 0; + outputFeature < FEATURE_LP_CAST_TYPE outputDesc.size[1]; ++outputFeature) { + + // Load output (f(x_is)). It is possible that this is zero, in + // which case we'll ignore this point. 
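          // [Editor's derivation, not part of the original source.] For the LP
          // pooling output f(x) = (sum_i x_i^p)^(1/p), the partial derivative is
          //   df/dx_i = x_i^(p-1) * (sum_j x_j^p)^(1/p - 1) = (x_i / f)^(p-1),
          // which is exactly the factor applied to gradOutputV in the loop
          // below. Skipping f == 0 here presumably avoids the division by zero
          // in that factor (e.g. for an all-zero pooling window), in which case
          // the gradient contribution is simply left at zero.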
+ real outputV = + outputP[ + flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)]; + + if (outputV == (real) 0) { + continue; + } + + for (i = 0; i < (FEATURE_LP_SIZE_TYPE) width; ++i) { + FEATURE_LP_SIZE_TYPE inputFeature = outputFeature * stride + i; + THAssert(inputFeature < inputDesc.size[1]); + + real gradOutputV = + gradOutputP[ + flpGetOffset(&gradOutputDesc, batch, outputFeature, opt1, opt2)]; + real inputV = + inputP[ + flpGetOffset(&inputDesc, batch, inputFeature, opt1, opt2)]; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + real v = gradOutputV * pow(inputV / outputV, power - (accreal) 1); + + gradInputP[ + flpGetOffset(&gradInputDesc, batch, inputFeature, opt1, opt2)] + += v; + } + } + } + } + } +} + +#endif diff --git a/aten/src/THNN/generic/FusedRNNKernel.c b/aten/src/THNN/generic/FusedRNNKernel.c new file mode 100644 index 0000000..30788b0 --- /dev/null +++ b/aten/src/THNN/generic/FusedRNNKernel.c @@ -0,0 +1,55 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/FusedRNNKernel.c" +#else + +void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *hx, + THTensor *hy, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *cx, + THTensor *hy, + THTensor *cy) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *prevC, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +#endif diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c new file mode 100644 index 0000000..68cdc37 --- /dev/null +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -0,0 +1,73 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/GatedLinearUnit.c" +#else + +void THNN_(GatedLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int dim) +{ + // size output to half of input + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + const int64_t inputSize = THTensor_(size)(input, dim) / 2; + THLongStorage *newSizes = THTensor_(newSizeOf)(input); + THLongStorage_set(newSizes, dim, inputSize); + THTensor_(resize)(output, newSizes, NULL); + + // halve tensor + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + + // x = x1:cmul( sigmoid(x2) ) + THTensor_(sigmoid)(output, secondHalf); + THTensor_(cmul)(output, output, firstHalf); + + THLongStorage_free(newSizes); + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); +} + +void THNN_(GatedLinear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int dim) +{ + // set up tensors + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + THTensor_(resizeAs)(gradInput, input); + const int64_t inputSize = THTensor_(size)(input, dim) / 2; + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize); + THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize); + + THTensor_(sigmoid)(gradInputfirstHalf, secondHalf); + + TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf, + real z = *gradInputfirstHalf_data; + *gradInputsecondHalf_data = (1. - z) * z; + ); + + THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput); + + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput); + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf); + + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); + THTensor_(free)(gradInputfirstHalf); + THTensor_(free)(gradInputsecondHalf); +} + +#endif diff --git a/aten/src/THNN/generic/HardShrink.c b/aten/src/THNN/generic/HardShrink.c new file mode 100644 index 0000000..18dea95 --- /dev/null +++ b/aten/src/THNN/generic/HardShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardShrink.c" +#else + +void THNN_(HardShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data > lambda) + *output_data = *input_data; + else if (*input_data >= -lambda) + *output_data = 0; + else + *output_data = *input_data; // let NaN case pass through here + ); +} + +void THNN_(HardShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data >= -lambda && *input_data <= lambda) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; // let NaN case pass through here + ); +} + +#endif diff --git a/aten/src/THNN/generic/HardTanh.c b/aten/src/THNN/generic/HardTanh.c new file mode 100644 index 0000000..a19c0ce --- /dev/null +++ b/aten/src/THNN/generic/HardTanh.c @@ -0,0 +1,138 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE 
"generic/HardTanh.c" +#else + +void THNN_(HardTanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + if (inplace) + THTensor_(set)(output, input); + else + THTensor_(resizeAs)(output, input); + + if (input->_dim() == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data < min_val) + *input_data = min_val; + else if (*input_data > max_val) + *input_data = max_val; + ); + } + else + { + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data < min_val) + *output_data = min_val; + else if (*input_data > max_val) + *output_data = max_val; + else + *output_data = *input_data; + ); + } + } + else + { + real* ptr_input = THTensor_(data)(input); + real* ptr_output = THTensor_(data)(output); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_input[i] = min_val; + else if (ptr_input[i] > max_val) + ptr_input[i] = max_val; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_output[i] = min_val; + else if (ptr_input[i] <= max_val) + ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = max_val; + } + } +} + +void THNN_(HardTanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + THTensor_(set)(gradInput, gradOutput); + else + THTensor_(resizeAs)(gradInput, input); + + if (input->_dim() == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradOutput_data = 0; + ); + } + else + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_input = THTensor_(data)(input); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Im2Col.c b/aten/src/THNN/generic/Im2Col.c new file mode 100644 index 0000000..bbb0dd8 --- /dev/null +++ b/aten/src/THNN/generic/Im2Col.c @@ -0,0 +1,119 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Im2Col.c" +#else + +static inline void THNN_(Im2Col_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + 
THArgCheck(kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 6, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(sW > 0 && sH > 0, 10, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + + int64_t ndim = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); + + int64_t dim_batch = 0; + if (ndim == 3) { + dim_batch = -1; + } + int64_t nInputPlane = THTensor_(size)(input, dim_batch + 1); + int64_t inputHeight = THTensor_(size)(input, dim_batch + 2); + int64_t inputWidth = THTensor_(size)(input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + if (outputHeight < 1 || outputWidth < 1) { + THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), calculated " + "shape of the array of sliding blocks as (%d, %d), which is " + "too small (non-positive).", + inputHeight, inputHeight, kH, kW, dH, dW, padH, padW, + outputHeight, outputWidth); + } +} + +void THNN_(Im2Col_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); + + input = THTensor_(newContiguous)(input); + bool batched_input = true; + if (input->dim() == 3) { + batched_input = false; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nInputPlane = THTensor_(size)(input, 1); + int64_t inputHeight = THTensor_(size)(input, 2); + int64_t inputWidth = THTensor_(size)(input, 3); + + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + THTensor_(resize3d)(output, batchSize, nOutputPlane, outputLength); + THTensor_(zero)(output); + + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + for (int64_t elt = 0; elt < batchSize; elt++) { + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, sH, sW, + dH, dW, THTensor_(data)(output_n)); + } + + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + if (!batched_input) { + THTensor_(resize2d)(output, nOutputPlane, outputLength); + } + THTensor_(free)(input); +} + +void THNN_(Im2Col_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + + THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, + inputHeight, inputWidth, + kH, kW, dH, dW, + padH, padW, sH, sW); +} + + +#endif diff --git a/aten/src/THNN/generic/IndexLinear.c 
b/aten/src/THNN/generic/IndexLinear.c new file mode 100644 index 0000000..50aa93d --- /dev/null +++ b/aten/src/THNN/generic/IndexLinear.c @@ -0,0 +1,727 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/IndexLinear.c" +#else + +#ifdef _OPENMP +#include +#endif + +/* Threshold used to trigger multithreading */ +#ifndef THNN_SPARSE_OMP_THRESHOLD +#define THNN_SPARSE_OMP_THRESHOLD 100000 +#endif + +/* Threshold used to trigger BLAS axpy call */ +#ifndef THNN_SPARSE_OUTDIM_THRESHOLD +#define THNN_SPARSE_OUTDIM_THRESHOLD 49 +#endif + +/* sign MACRO */ +#ifndef THNN_INDEXLINEAR_SIGN +#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) ) +#endif + +static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values) +{ + return THLongTensor_size(keys, 0) == THTensor_(nElement)(values) + && THTensor_(_nDimension)(values) == 1 + && THLongTensor__nDimension(keys) == 1; +} + +void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train) +{ + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t keysSize = THLongTensor_size(keys, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + int64_t* sizesData = THLongTensor_data(sizes); + int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes); + + /* Define/resize the normalized values tensor if maxNormalize is > 0 */ + real* normalizedValuesData = NULL; + if (maxNormalize) + { + THTensor_(resize1d)(normalizedValues, keysSize); + normalizedValuesData = THTensor_(data)(normalizedValues); + } + + /* Resize the output */ + THTensor_(resize2d)(output, batchSize, outDim); + + /* Access the storage data/strides */ + real* outputData = THTensor_(data)(output); + real* valuesData = THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + int64_t weightStride0 = weight->stride[0]; + real* biasData = THTensor_(data)(bias); + int64_t* keysData = THLongTensor_data(keys); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous"); + int64_t i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
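+   * Layout note: weight has woutDim = outDim + maxNormalize columns. When
+   * maxNormalize > 0, the first maxNormalize entries of a key's row hold its
+   * normalization state (roughly: running max |value|, its reciprocal, an
+   * update scale and an additive correction), and the actual output weights
+   * start at column maxNormalize, hence the woffset + maxNormalize offsets
+   * below.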
*/ + if (outDim == 1) + { + THVector_(fill)(outputData, *biasData, batchSize); + if (maxNormalize) + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, keysOffset, \ + weightData, keysData, \ + valuesData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + real* loutputData = outputData + j; + real val = 0; + real absVal = 0; + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + + for (i = 0; i < sizesData[j]; i++) + { + int64_t woffset = weightStride0*(keysData[offset] + keysOffset); + absVal = fabs(valuesData[offset]); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * This is used at update time. + * TODO: implement a smarter update scale. + */ + weightData[woffset+2] = 1; + } + normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3]; + val += normalizedValuesData[offset] * weightData[woffset+maxNormalize]; + offset++; + } + *loutputData += val; + } + } + else + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + outputData, cumSumSizesData, \ + sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real* loutputData = outputData + j; + real val = 0; + + for (i = 0; i < sizesData[j]; i++) + { + val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset]; + offset++; + } + *loutputData += val; + } + } + } + else { +#pragma omp parallel \ + for private(i,j,k) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + biasData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real val; + real* loutputData = outputData + j*outDim; + real* lweightData = weightData; + memcpy(loutputData, biasData, outDim*sizeof(real)); + for (i = 0; i < sizesData[j]; i++) + { + int64_t woffset = weightStride0*(keysData[offset] + keysOffset); + if (maxNormalize) + { + val = valuesData[offset]; + real absVal = fabs(val); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * The commented section thereafter is just an example of what can be done: + * + *``` + * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1)); + * real alpha = 1; + * real beta = 0.01; + * real gamma = 1 - 0.000001; + * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta); + * l = gamma*l; + * weightData[woffset+2] = (alpha-beta)*l + beta; + * ``` + * + * TODO: implement a smarter update scale. 
+ */ + weightData[woffset+2] = 1; + } + + /* Normalize + Clamp */ + val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3]; + normalizedValuesData[offset] = val; + + lweightData = weightData + woffset + maxNormalize; + } + else + { + val = valuesData[offset]; + lweightData = weightData + woffset; + } + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + loutputData[k] += lweightData[k] * val; + } + } + offset++; + } + } + } + return; +} + +void THNN_(IndexLinear_updateParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THLongTensor *runningKeys, + THLongTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay_, + accreal learningRate_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + /* Retrieve all the dimensions of the problem */ + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + int64_t keysSize = THLongTensor_size(runningKeys, 0); + + /* Access the storage data/strides */ + real* gradWeightData = THTensor_(data)(gradWeight); + real* weightData = THTensor_(data)(weight); + int64_t weightStride0 = weight->stride[0]; + real* gradBiasData = THTensor_(data)(gradBias); + real* biasData = THTensor_(data)(bias); + int64_t* keysData = THLongTensor_data(runningKeys); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 3, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 4, "gradBias vector must be contiguous"); + THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous"); + + int j, k; + + /* Update the bias first */ + THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim); + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
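+   * The bias was already updated densely above; the loops below touch only
+   * the weight rows referenced by runningKeys.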
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr; + } + } + } + else + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset); + weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate; + } + } + } + } + else + { + for (j = 0; j < keysSize; j++) + { + real lr = learningRate; + real wd = weightDecay; + real* lweightData; + int64_t woffset = weightStride0*(keysData[j] + keysOffset); + real* lgradWeightData = gradWeightData + j*outDim; + if (maxNormalize) + { + lgradWeightData += j*outDim; + /* weightData[woffset + 2] */ + lweightData = weightData + woffset + maxNormalize - 2; + lr = lr*lweightData[0]; + wd = weightDecay*lweightData[0]; + /* weightData[woffset + 3] */ + lweightData++; + for (k=0; k < outDim; k++) + { + lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr; + } + lweightData++; + lgradWeightData += outDim; + } + else + { + lweightData = weightData + woffset; + } + + /* We do sparse weight decay. + * We think it makes more sense. 
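+       * "Sparse" means the decay is applied only to the weight rows touched
+       * by this update (the keys in runningKeys), not to the whole weight
+       * matrix on every step.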
*/ + if (weightDecay) + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lweightData[k]*wd; + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lgradWeightData[k]*lr; + } + } + } + } +} + + +void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + + /* Access the storage data/strides */ + real* gradOutputData = THTensor_(data)(gradOutput); + real* valuesData =THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + real* biasData = THTensor_(data)(bias); + int64_t weightStride0 = weight->stride[0]; + int64_t* keysData = THLongTensor_data(keys); + int64_t* sizesData = THLongTensor_data(sizes); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous"); + + int i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-1] -= weightData[idx]*val*weightData[idx-2]; + weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2]; + offset++; + } + } + + offset = 0; + for (j = 0; j < batchSize; j++) + { + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-2] = 0; + offset++; + } + } + } + else + { + if (weightDecay) + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset); + weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay; + offset++; + } + } + } + else + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real val = gradOutputData[j] * scale; + for (i = 0; i < sizesData[j]; i++) + { + weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset]; + offset++; + } + *biasData -= val; + } + } + } + } + else { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j*outDim; + real* lweightData = weightData; + THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + real wd = weightDecay; + + // Max normalize case + if (maxNormalize) + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); + val *= lweightData[0]; + wd *= lweightData[0]; + for (k=0; k < outDim; k++) + { + lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0]; + } + lweightData += 2; + } + else + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset); + } + + /* We do sparse weight decay. + * We think it makes more sense. */ + if (weightDecay) + { + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= wd * lweightData[k]; + } + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= val * lgradOutputData[k]; + } + } + offset++; + } + } + + /* Max Normalize case: + * Reset the smart update scaling if + * one does it batch-wise. + * TODO: Decide what to do with that piece of code. 
+ * NB: If the code belowe is uncommented, so should the commented + * code in IndexLinear:zeroGradParameters() */ + + /* + if (maxNormalize) + { + offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lweightData = weightData; + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + real wd = weightDecay; + + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); + lweightData[0] = 0; + offset++; + } + } + } + */ + } + return; +} + +void THNN_(IndexLinear_accGradParameters)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THTensor *valuesBuffer, + accreal weightDecay_, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t keysSize = THLongTensor_size(keys, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int64_t maxNormalize = (woutDim - outDim) > 0 ?1:0; + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + int64_t* sizesData = THLongTensor_data(sizes); + + /* COmpute the cumulative sizes */ + THLongTensor* cumSizes = THLongTensor_new(); + THLongTensor_cumsum(cumSizes, sizes, 0); + int64_t* cumSizesData = THLongTensor_data(cumSizes); + + /* Resize the gradWeight buffer to keep it dense. + * That speeds up updates A LOT assuming random mem access. */ + THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1)); + + /* Access the storage data/strides */ + real* gradOutputData = THTensor_(data)(gradOutput); + real* valuesData =THTensor_(data)(values); + real* gradWeightData = THTensor_(data)(gradWeight); + real* gradBiasData = THTensor_(data)(gradBias); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous"); + + int i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
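+   * gradWeight was resized above to one dense row per key occurrence
+   * ([keysSize, outDim], or [keysSize, 2*outDim] when max-normalization is
+   * enabled), so the accumulated gradients can later be applied with
+   * contiguous memory access.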
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + for (j = 0; j < batchSize; j++) + { + int64_t offset = j==0?0:cumSizesData[j-1]; + real val = gradOutputData[j] * scale; + real* lgradWeightData = gradWeightData + offset; + real* lvaluesData = valuesData + offset; + int64_t end = sizesData[j]; + + if (maxNormalize) + { + lgradWeightData += offset; + i = 0; + for(;i < end; i++) + { + lgradWeightData[2*i] = val; + lgradWeightData[2*i+1] = val * lvaluesData[i]; + } + } + else + { + i = 0; + for(;i < end-4; i += 4) + { + lgradWeightData[i] = val * lvaluesData[i]; + lgradWeightData[i+1] = val * lvaluesData[i+1]; + lgradWeightData[i+2] = val * lvaluesData[i+2]; + lgradWeightData[i+3] = val * lvaluesData[i+3]; + } + + for(; i < end; i++) + { + lgradWeightData[i] = val * lvaluesData[i]; + } + } + *gradBiasData += val; + offset += end; + } + } + else { + for (j = 0; j < batchSize; j++) + { + int64_t offset = j==0?0:cumSizesData[j-1]; + real* lgradOutputData = gradOutputData + j*outDim; + real* lgradWeightData = gradWeightData; + THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + lgradWeightData = gradWeightData + offset*outDim; + if (maxNormalize) + { + lgradWeightData += offset*outDim; + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + lgradWeightData[k+1] = lgradOutputData[k+1]*scale; + lgradWeightData[k+2] = lgradOutputData[k+2]*scale; + lgradWeightData[k+3] = lgradOutputData[k+3]*scale; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + } + lgradWeightData += outDim; + } + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = val * lgradOutputData[k]; + lgradWeightData[k+1] = val * lgradOutputData[k+1]; + lgradWeightData[k+2] = val * lgradOutputData[k+2]; + lgradWeightData[k+3] = val * lgradOutputData[k+3]; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = val * lgradOutputData[k]; + } + offset++; + } + } + } + THLongTensor_free(cumSizes); + return; +} +#endif diff --git a/aten/src/THNN/generic/L1Cost.c b/aten/src/THNN/generic/L1Cost.c new file mode 100644 index 0000000..8f5eb17 --- /dev/null +++ b/aten/src/THNN/generic/L1Cost.c @@ -0,0 +1,38 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/L1Cost.c" +#else + +void THNN_(L1Cost_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + accreal sum = 0; + + TH_TENSOR_APPLY(real, input, + sum += fabs(*input_data); + ); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(L1Cost_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY2(real, gradInput, real, input, + if (*input_data > 0) + *gradInput_data = 1; + else if (*input_data < 0) + *gradInput_data = -1; + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/LeakyReLU.c b/aten/src/THNN/generic/LeakyReLU.c new file mode 100644 index 0000000..abca9fb --- /dev/null +++ b/aten/src/THNN/generic/LeakyReLU.c @@ -0,0 +1,58 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LeakyReLU.c" +#else + +void THNN_(LeakyReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + 
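+  // LeakyReLU: f(x) = x for x > 0 and negval * x otherwise. The in-place
+  // branch below only rescales the non-positive entries of input.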
if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + *input_data *= negval; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + const real r = (*input_data > 0) ? 1 : negval; + *output_data = *input_data * r; + ); + } +} + +void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + *gradOutput_data *= negval; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval; + ); + } +} + +#endif diff --git a/aten/src/THNN/generic/Linear.c b/aten/src/THNN/generic/Linear.c new file mode 100644 index 0000000..630dc4c --- /dev/null +++ b/aten/src/THNN/generic/Linear.c @@ -0,0 +1,114 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Linear.c" +#else + +void THNN_(Linear_updateAddBuffer)( + THNNState *state, + THTensor *input, + THTensor *addBuffer) +{ + int64_t nframe = THTensor_(size)(input,0); + int64_t nElement = THTensor_(nElement)(addBuffer); + if (nElement != nframe) { + THTensor_(resize1d)(addBuffer,nframe); + THTensor_(fill)(addBuffer,1.0); + } +} + +void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer) +{ + int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor_(resize1d)(output,THTensor_(size)(weight,0)); + if (bias) { + THTensor_(copy)(output,bias); + } + else { + THTensor_(zero)(output); + } + THTensor_(addmv)(output,1,output,1,weight,input); + } + else if (dim == 2) { + int64_t nframe = THTensor_(size)(input,0); + int64_t nElement = THTensor_(nElement)(output); + THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0)); + if (THTensor_(nElement)(output) != nElement) { + THTensor_(zero)(output); + } + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmm)(output,0,output,1,input,tweight); + THTensor_(free)(tweight); + if (bias) { + THTensor_(addr)(output,1,output,1,addBuffer,bias); + } + } +} + +void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ + if (gradInput) { + int64_t nElement = THTensor_(nElement)(gradInput); + THTensor_(resizeAs)(gradInput,input); + if (THTensor_(nElement)(gradInput) != nElement) { + THTensor_(zero)(gradInput); + } + + int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput); + THTensor_(free)(tweight); + } + else if (dim == 2) { + THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight); + } + } +} + +void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + 
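+  // Accumulates gradWeight += scale * gradOutput^T * input and
+  // gradBias += scale * gradOutput^T * ones(nframe), where the ones vector
+  // is addBuffer. The 1D case uses an outer product (addr) and a vector add
+  // instead of the matrix products.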
int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input); + if (bias) { + THTensor_(cadd)(gradBias,gradBias,scale,gradOutput); + } + } + else if (dim == 2) { + THTensor *tgradOutput = THTensor_(new)(); + THTensor_(transpose)(tgradOutput,gradOutput,0,1); + THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input); + if (bias) { + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer); + } + THTensor_(free)(tgradOutput); + } +} + +#endif diff --git a/aten/src/THNN/generic/LogSigmoid.c b/aten/src/THNN/generic/LogSigmoid.c new file mode 100644 index 0000000..556af4f --- /dev/null +++ b/aten/src/THNN/generic/LogSigmoid.c @@ -0,0 +1,51 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSigmoid.c" +#else + +void THNN_(LogSigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer) +{ + THTensor_(resizeAs)(output, input); + THTensor_(resizeAs)(buffer, input); + //Use the LogSumExp trick to make this stable against overflow + TH_TENSOR_APPLY3(real, output, real, input, real, buffer, + real max_elem = fmax(0, -*input_data); + real z = exp(-max_elem) + exp(-*input_data - max_elem); + *buffer_data = z; + *output_data = -(max_elem + log(z)); + ); +} + +void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, buffer); +/* deriv of -max(0,-x) - log(e(0 - max(0,-x)) + e(-x - max(0,-x))) is + * -max_deriv - (-max_deriv*e(0-max(0,-x)) + (-1 - max_deriv)*e(-x - max(0,-x)))/z + * where z = e(0 - max(0,-x)) + e(-x - max(0,-x)) + * which simplifies to + * -max_deriv - (z-1)/z if x is >= 0 or + * -max_deriv + (z-1)/z if x is < 0 + */ + TH_TENSOR_APPLY3(real, input, real, gradInput, real, buffer, + real z = *buffer_data; + real max_deriv = 0.0; + real sign = -1.0; + if (*input_data < 0){ + max_deriv = -1.0; + sign = 1.0; + } + *gradInput_data = -max_deriv - sign*((z - 1.0)/ z); + ); + THTensor_(cmul)(gradInput, gradOutput, gradInput); +} + +#endif diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c new file mode 100644 index 0000000..05694fc --- /dev/null +++ b/aten/src/THNN/generic/LookupTable.c @@ -0,0 +1,225 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LookupTable.c" +#else + +static void THNN_(LookupTable_resetCount)( + THInteger_t *count_data, + THIndexTensor *input) +{ + ptrdiff_t i; + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + + for (i = 0; isize[0]); + count_data = THIntegerTensor_(data)(count); + } + + if (!THTensor_(isContiguous)(gradWeight)) + THError("gradWeight must be contiguous"); + if (!THIndexTensor_(isContiguous)(input)) + THError("input must be contiguous"); + if (input->is_empty() || (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)) { + THDescBuff s1 = THIndexTensor_(sizeDesc)(input); + THError("input must be a non-empty vector or matrix, but is of shape: %s", s1.str); + } + + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + int64_t numw = THTensor_(size)(gradWeight, 0); + + // check that inputs are all within range + for (i=0; i= numw + TH_INDEX_BASE) { + THError("inputs need to be in the range %ld <= input < %ld, " + "but 
got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + input_data[i]); + } + + gradOutput = THTensor_(newContiguous)(gradOutput); + + real *gw = THTensor_(data)(gradWeight); + real *go = THTensor_(data)(gradOutput); + int64_t stride = THTensor_(stride)(gradWeight, 0); + + if (count_data) + THNN_(LookupTable_resetCount)(count_data, input); + +#ifdef _OPENMP + if (numel > 1000) + { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. + #pragma omp parallel private(i) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + + int64_t start = tid * (numw/nthreads + 1); + int64_t end = start + (numw/nthreads + 1); + for (i=0; i= start && k < end) + { + real scale_ = scale; + if (count_data) scale_ /= count_data[k]; + THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); + } + } + } + } + + THTensor_(free)(gradOutput); + return; + } +#endif + + for (i=0; i maxNorm) + { + new_norm = maxNorm / (norm + 1e-7); + for (j=0; jis_empty() || THIndexTensor_(nDimension)(idx) != 1) + THError("idx must be a non-empty vector"); + if (normType <= 0) + THError("non-positive-norm not supported"); + + ptrdiff_t i; + THIndex_t *row_idx = THIndexTensor_(data)(idx); + ptrdiff_t numel = THIndexTensor_(nElement)(idx); + + int64_t numw = THTensor_(size)(weight, 0); + int64_t stride = THTensor_(stride)(weight, 0); + real *gw = THTensor_(data)(weight); + for (i=0; i= numw + TH_INDEX_BASE) { + THError("input need to be in the range %ld <= input < %ld, " + "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + row_idx[i]); + } + } + // get unique indices + qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); + ptrdiff_t ptr = 0; + for (i=0; i 1000) + { + // The strategy is to parallelize over the rows that appear in + // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads]. + // This distributes the work evenly to each thread. + #pragma omp parallel for private(i) + for (i=0; i0 ? z : 0; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data * *target_data) < margin ? 
-norm * *target_data : 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c new file mode 100644 index 0000000..3072c03 --- /dev/null +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -0,0 +1,256 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + int64_t reduction) +{ + real *input_data, *isTarget_data; + THIndex_t *target_data; + int64_t nframe, dim; + int64_t t, d, dt, ddt; + real sum; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), + "inconsistent target size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size[0] == nframe) + && (target->size[1] == dim), "inconsistent target size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + + THNN_resizeAs_indices(isTarget, target); + THTensor_(zero)(isTarget); + isTarget_data = THTensor_(data)(isTarget); + + if (reduction != Reduction::None) + { + THTensor_(resize1d)(output, 1); + + sum = 0; + for (t = 0; t < nframe; t++) + { + for (ddt = 0; ddt < dim; ddt++) + { + THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + sum += z; + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + } + + sum /= dim; + if (reduction == Reduction::ElementwiseMean) + sum /= nframe; + THTensor_(fastSet1d)(output, 0, sum); + + THTensor_(free)(input); + THIndexTensor_(free)(target); + return; + } + + THTensor_(resize1d)(output, nframe); + + for (t = 0; t < nframe; t++) + { + for (ddt = 0; ddt < dim; ddt++) + { + THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } + + sum = 0; + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + sum += z; + } + } + } + + sum /= dim; + THTensor_(fastSet1d)(output, t, sum); + + input_data += dim; + target_data += dim; + isTarget_data += dim; + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor 
*gradOutput, + THTensor *gradInput, + THTensor *isTarget, + int64_t reduction) +{ + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *isTarget_data; + int64_t nframe, dim; + int64_t t, d, dt; + real g; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size[0] == dim), + "inconsistent target size"); + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size[0] == dim), + "inconsistent isTarget size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size[0] == nframe) + && (isTarget->size[1] == dim), 3, "inconsistent isTarget size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range"); + THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + isTarget = THTensor_(newContiguous)(isTarget); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + isTarget_data = THTensor_(data)(isTarget); + + THTensor_(resizeAs)(gradInput, input); + gradInput = THTensor_(newContiguous)(gradInput); + THTensor_(zero)(gradInput); + gradInput_data = THTensor_(data)(gradInput); + + g = reduction == Reduction::ElementwiseMean ? 
(1./((real)(nframe*dim))) : (1./((real)dim)); + + for (t = 0; t < nframe; t++) + { + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + { + gradInput_data[target_idx] -= g; + gradInput_data[d] += g; + } + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + gradInput_data += dim; + } + gradInput_data = THTensor_(data)(gradInput); + + if (reduction != Reduction::None) + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + for (t = 0; t < nframe*dim; t++) + { + gradInput_data[t] *= THTensor_(fastGet1d)(gradOutput, 0); + } + } + else + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, nframe); + for (t = 0; t < nframe; t++) + { + for (d = 0; d < dim; d++) + { + gradInput_data[t * dim + d] *= THTensor_(fastGet1d)(gradOutput, t); + } + } + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + THTensor_(free)(isTarget); + THTensor_(free)(gradInput); +} + +#endif diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c new file mode 100644 index 0000000..620e13c --- /dev/null +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -0,0 +1,223 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data, *weights_data; + THIndex_t *target_data; + int64_t nframe, dim; + int64_t t, d; + real sum; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + "inconsistent target size, got: ", target->sizes()); + } + + for (t = 0; t < nframe; t++) + { + THIndex_t idx = THIndexTensor_(get1d)(target, t); + THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, + "target out of range"); + } + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + weights_data = weights ? THTensor_(data)(weights) : NULL; + + if (reduction == Reduction::None) + { + THTensor_(resize1d)(output, nframe); + + for (t = 0; t < nframe; t++) + { + sum = 0; + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) { + real h = (p==1) ? 
z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } + } + + sum /= dim; + THTensor_(fastSet1d)(output, t, sum); + input_data += dim; + } + } + else + { + THTensor_(resize1d)(output, 1); + + sum = 0; + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) { + real h = (p==1) ? z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } + } + input_data += dim; + } + + sum /= dim; + if(reduction == Reduction::ElementwiseMean) + sum /= nframe; + + THTensor_(set1d)(output, 0, sum); + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *weights_data; + int64_t nframe, dim; + int64_t t, d; + real g; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + "inconsistent target size, got: ", target->sizes()); + } + + g = (reduction == Reduction::ElementwiseMean ? 1./((real)(nframe*dim)) : 1./((real)dim)); + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + THArgCheck(THTensor_(isContiguous)(gradInput), 5, "gradInput must be contiguous"); + gradInput_data = THTensor_(data)(gradInput); + + target_data = THIndexTensor_(data)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + weights_data = weights ? THTensor_(data)(weights) : NULL; + + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + real gradInput_target = 0; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) + { + real h = (p == 1) ? 
g : 2*g*z; + if(weights_data) + h *= weights_data[target_idx]; + gradInput_target -= h; + gradInput_data[d] = h; + } + else + gradInput_data[d] = 0; + } + gradInput_data[target_idx] = gradInput_target; + + input_data += dim; + gradInput_data += dim; + } + gradInput_data = THTensor_(data)(gradInput); + + if (reduction != Reduction::None) + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + for (t = 0; t < nframe * dim; t++) { + gradInput_data[t] *= THTensor_(fastGet1d)(gradOutput, 0); + } + } + else + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, nframe); + for (t = 0; t < nframe; t++) + { + for (d = 0; d < dim; d++) + { + gradInput_data[t * dim + d] *= THTensor_(fastGet1d)(gradOutput, t); + } + } + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +#endif diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c new file mode 100644 index 0000000..462280c --- /dev/null +++ b/aten/src/THNN/generic/PReLU.c @@ -0,0 +1,202 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/PReLU.c" +#else + +void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight) +{ + THTensor_(resizeAs)(output, input); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + // handle shared parameter case + real w = *THTensor_(data)(weight); + TH_TENSOR_APPLY2(real, output, real, input, + const real r = (*input_data > 0) ? 1 : w; + *output_data = *input_data * r; + ); + return; + } + + input = THTensor_(newContiguous)(input); + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + real *weight_data = THTensor_(data)(weight); + THIndex_t i, j, k; + #pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + real* n_input_data = input_data + i*nOutputPlane*ks; + real* n_output_data = output_data + i*nOutputPlane*ks; + for (j = 0; j < nOutputPlane; ++j) + { + for (k = 0; k < ks; ++k) + n_output_data[k] = (n_input_data[k] > 0) ? 
n_input_data[k] : weight_data[j] * n_input_data[k]; + n_input_data += ks; + n_output_data += ks; + } + } + THTensor_(free)(input); +} + +void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + real w = THTensor_(data)(weight)[0]; + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > 0) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = w * (*gradOutput_data); + ); + return; + } + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradInput_data = THTensor_(data)(gradInput); + + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + THIndex_t i, j, k; + #pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real w = weight_data[j]; + for (k = 0; k < ks; ++k) + { + if (n_input_data[k] > 0) + n_gradInput_data[k] = n_gradOutput_data[k]; + else + n_gradInput_data[k] = n_gradOutput_data[k] * w; + } + n_input_data += ks; + n_gradInput_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_CHECK_NELEMENT(input, gradOutput); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + real *gradWeight_data = THTensor_(data)(gradWeight); + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, gradOutput, + if ((*input_data) <= 0) + sum += (*input_data) * (*gradOutput_data); + ); + gradWeight_data[0] += scale * sum; + return; + } + + THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous"); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + + THIndex_t i, j, k; + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real sum = 0; + for (k = 0; k < ks; ++k) + if (n_input_data[k] <= 0) + sum += n_gradOutput_data[k] * n_input_data[k]; + gradWeight_data[j] += scale * sum; + n_input_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +#endif diff --git a/aten/src/THNN/generic/RReLU.c b/aten/src/THNN/generic/RReLU.c new file mode 100644 index 0000000..8fd46d3 --- /dev/null +++ b/aten/src/THNN/generic/RReLU.c @@ -0,0 +1,132 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/RReLU.c" +#else + +void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace, + THGenerator *generator) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + if (train) + { + // get default random generator + THTensor_(resizeAs)(noise, input); + if (inplace) + { + TH_TENSOR_APPLY2(real, input, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *input_data = (*input_data) * r; + *noise_data = r; + } + else + { + *noise_data = 1; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, output, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *output_data = (*input_data) * r; + *noise_data = r; + } + else + { + *output_data = *input_data; + *noise_data = 1; + } + ); + } + } + else + { + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + { + *input_data = *input_data * negSlope; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + const real r = (*input_data) <= 0 ? negSlope : 1; + *output_data = *input_data * r; + ); + } + } +} + +void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THTensor_(cmul)(gradOutput, gradOutput, noise); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + THTensor_(cmul)(gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + { + *gradOutput_data = (*gradOutput_data) * negSlope; + } + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data); + ); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Sigmoid.c b/aten/src/THNN/generic/Sigmoid.c new file mode 100644 index 0000000..2b218dd --- /dev/null +++ b/aten/src/THNN/generic/Sigmoid.c @@ -0,0 +1,27 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sigmoid.c" +#else + +void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(sigmoid)(output, input); +} + +void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_NELEMENT(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; + *gradInput_data = *gradOutput_data * (1. - z) * z; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SmoothL1Criterion.c b/aten/src/THNN/generic/SmoothL1Criterion.c new file mode 100644 index 0000000..b9eca65 --- /dev/null +++ b/aten/src/THNN/generic/SmoothL1Criterion.c @@ -0,0 +1,80 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" +#else + +void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + real z = fabs(*input_data - *target_data); + *output_data = z < 1 ? 0.5 * z * z : z - 0.5; + ); + return; + } + + THTensor_(resize1d)(output, 1); + + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = fabs(*input_data - *target_data); + sum += z < 1 ? 0.5*z*z : z - 0.5; + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if (x < -1.) { + *gradInput_data = -1.; + } else if (x > 1.) { + *gradInput_data = 1.; + } else { + *gradInput_data = x; + } + ); + TH_TENSOR_APPLY2(real, gradInput, real, gradOutput, + *gradInput_data *= *gradOutput_data; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.) 
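/*
  The smooth L1 criterion used here is the Huber loss with a fixed threshold of 1.
  A standalone sketch of the element-wise value and derivative (the helper names are
  illustrative only):

      #include <math.h>

      // 0.5 * z * z for |x - t| < 1, |x - t| - 0.5 otherwise
      static double smooth_l1(double x, double t) {
          double z = fabs(x - t);
          return z < 1.0 ? 0.5 * z * z : z - 0.5;
      }

      // derivative with respect to x: the difference clamped to [-1, 1]
      static double smooth_l1_grad(double x, double t) {
          double d = x - t;
          return d < -1.0 ? -1.0 : (d > 1.0 ? 1.0 : d);
      }

  In the reduced path the per-element derivative is additionally scaled by 1/N for
  the element-wise mean and by the single incoming gradOutput value, which is what
  the norm factor being assembled here does.
*/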
* THTensor_(fastGet1d)(gradOutput, 0); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if (x < -1.) + *gradInput_data = - norm; + else if (x > 1.) + *gradInput_data = norm; + else + *gradInput_data = norm * x; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SoftMarginCriterion.c b/aten/src/THNN/generic/SoftMarginCriterion.c new file mode 100644 index 0000000..8fb31f9 --- /dev/null +++ b/aten/src/THNN/generic/SoftMarginCriterion.c @@ -0,0 +1,65 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = log(1. + exp(-*input_data * *target_data));) + return; + } + + THTensor_(resize1d)(output, 1); + + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = log(1. + exp(-*input_data* *target_data)); + sum += z;) + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (!reduction) { + THNN_CHECK_SHAPE(gradOutput, input); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -*target_data * z/(1. + z);) + THTensor_(cmul)(gradInput, gradInput, gradOutput); + return; + } + + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -norm*(*target_data)*z/(1. + z) * THTensor_(fastGet1d)(gradOutput, 0);) +} + +#endif diff --git a/aten/src/THNN/generic/SoftPlus.c b/aten/src/THNN/generic/SoftPlus.c new file mode 100644 index 0000000..6491e66 --- /dev/null +++ b/aten/src/THNN/generic/SoftPlus.c @@ -0,0 +1,47 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftPlus.c" +#else + +void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THTensor_(resizeAs)(output, input); + + // f(x) = 1/beta * log(1 + exp(beta * x)) + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data * beta) > threshold ? 
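/*
  The threshold in this ternary is a numerical guard: for large beta*x the exact
  softplus log1p(exp(beta*x))/beta would overflow exp(), but it is already equal to x
  to within rounding, so the identity is returned instead. A standalone sketch,
  assuming double precision (softplus is an illustrative name):

      #include <math.h>

      static double softplus(double x, double beta, double threshold) {
          double bx = beta * x;
          return bx > threshold ? x : log1p(exp(bx)) / beta;
      }

  The backward pass mirrors the same test: above the threshold the incoming gradient
  is passed through unchanged, otherwise it is scaled by (exp(beta*y) - 1)/exp(beta*y).
*/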
*input_data : THLog1p(exp(*input_data * beta)) / beta; + ); +} + +void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) + // SINCE + // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) + // THEREFORE: + // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = exp(*output_data * beta); + *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SoftShrink.c b/aten/src/THNN/generic/SoftShrink.c new file mode 100644 index 0000000..e779508 --- /dev/null +++ b/aten/src/THNN/generic/SoftShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftShrink.c" +#else + +void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if ((*input_data) > lambda) + *output_data = *input_data - lambda; + else if ((*input_data) < -lambda) + *output_data = *input_data + lambda; + else + *output_data = 0; + ); +} + +void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > lambda || (*input_data) < -lambda) + *gradInput_data = (*gradOutput_data); + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c new file mode 100644 index 0000000..a0c078b --- /dev/null +++ b/aten/src/THNN/generic/SparseLinear.c @@ -0,0 +1,564 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SparseLinear.c" +#else + +#ifdef _OPENMP +#include +#endif + +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) + +static bool THNN_(checkLegacyInput)(THTensor* t) +{ + return !t->is_empty() && t->dim() == 3 && t->size[2] == 2; +} + +static bool THNN_(checkInput)(THTensor* t) +{ + return!t->is_empty() && t->dim() == 2 && t->size[1] == 3; +} + +static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) +{ + return !t->is_empty() && t->dim() == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) +{ + return !t->is_empty() && t->dim() == 1 && t->size[0] == size0; +} + +static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); +} +static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); +} +static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { + return THStorage_(get)(t->storage, t->storageOffset + + 
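/*
  These small get/set helpers just do strided element lookup by hand: element
  (x0, x1) of a 2-D tensor lives at storageOffset + x0*stride[0] + x1*stride[1] in
  the flat storage. A self-contained sketch of the same indexing (illustrative
  names, float storage assumed):

      #include <stdint.h>

      static float get2d_strided(const float *storage, int64_t offset,
                                 int64_t stride0, int64_t stride1,
                                 int64_t x0, int64_t x1) {
          return storage[offset + x0 * stride0 + x1 * stride1];
      }

  They are used below to walk the sparse input, which the checks above expect in COO
  form as an nnz x 3 matrix of (row, column, value) triples; the code treats the
  indices as 1-based, subtracting 1 and reporting errors as "not between 1 and inDim".
*/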
x0*t->stride[0] + x1*t->stride[1]); +} + +void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + int64_t h, i, hp0, hp1; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + int64_t batchSize = THTensor_(size)(output, 0); + + THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + int64_t nnz = THTensor_(size)(input, 0); + + THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); + THLongTensor_zero(csr); + + weight = THTensor_(newContiguous)(weight); + +//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; i 10000) + for (h = 0; h < batchSize; h++) { + int64_t i_start = THLongTensor_get1d(csr, h); + int64_t i_end = THLongTensor_get1d(csr, h+1); + for (i = i_start; i < i_end; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THLongTensor_free(csr); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + int64_t h, i; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + weight = THTensor_(newContiguous)(weight); + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(output, batchSize, outDim); + + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(h, i) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. 
updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t h, i, col, hp0, hp1; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, + "input must be in coo format, nnz x 3"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + int64_t nnz = THTensor_(size)(input, 0); + + THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); + THLongTensor_zero(csc); + weight = THTensor_(newContiguous)(weight); + +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i = 0; i < nnz; i++) { + hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + hp1 = (i+1 == nnz) ? + inDim : + (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csc, h+1, i+1); + } + } + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) + for (col = 0; col < inDim; col++) { + int64_t i_start = THLongTensor_get1d(csc, col); + int64_t i_end = THLongTensor_get1d(csc, col+1); + for (i = i_start; i < i_end; i++) { + real val = scale * THNN_(get2d)(input, i, 2); + + h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; + int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* buf = THTensor_(new)(); + THTensor_(sum)(buf, gradOutput, 0, 1); + THTensor_(cadd)(gradBias, gradBias, scale, buf); + THTensor_(free)(buf); + THLongTensor_free(csc); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t h, i; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, + "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(gradOutput, batchSize, outDim); + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i) schedule(static) if (\ + batchSize * nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + for (h = 0; h < batchSize; h++) { + real val = scale * THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* gradOutput_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(gradOutput_row, gradOutput, 0, h); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); + } + THTensor_(free)(gradOutput_row); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } +} + +void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + int64_t i; + int64_t outDim = weight->size[0]; + int64_t inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 6, + "input must be in coo format, nnz x 3"); + + + int64_t nnz = THTensor_(size)(lastInput, 0); + + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(nnz); + int64_t cnt = 0; + for (i = 0; i < nnz; i++) { + real val = THNN_(get2d)(lastInput, i, 2); + if (val == 0) { + continue; + } + int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + if (cnt == 0) return; + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + int64_t offset = (int64_t)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + int64_t h, i; + int64_t outDim = weight->size[0]; + int64_t inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, + "input size must be batchsize x nnz x 2"); + + + int64_t batchSize = THTensor_(size)(lastInput, 0); + int64_t nnz = THTensor_(size)(lastInput, 1); + + // collect unique offsets of non-0 val in input + THTensor* offsets = 
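/*
  Both updateParameters variants use the same trick to visit each touched weight
  column once: collect the column offset of every non-zero input entry, sort, and
  compact out duplicates with a linear pass. A standalone sketch of that pattern on a
  plain int64_t array (the real code stores the offsets in a THTensor and sorts with
  THTensor_(sort)):

      #include <stdint.h>
      #include <stdlib.h>

      static int cmp_i64(const void *a, const void *b) {
          int64_t x = *(const int64_t *)a, y = *(const int64_t *)b;
          return (x > y) - (x < y);
      }

      // Sorts off[0..n) and removes duplicates in place; returns the new count.
      static int64_t unique_offsets(int64_t *off, int64_t n) {
          if (n == 0) return 0;
          qsort(off, (size_t)n, sizeof *off, cmp_i64);
          int64_t cnt = 1;
          for (int64_t i = 1; i < n; ++i)
              if (off[i] != off[i - 1]) off[cnt++] = off[i];
          return cnt;
      }

  The deduplicated offsets then drive one axpy per column, applying
  -learningRate * gradWeight to only the columns the sparse input actually touched.
*/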
THTensor_(newWithSize1d)(batchSize * nnz); + int64_t cnt = 0; + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(lastInput, h, i, 1); + if (val == 0 ) { + continue; + } + int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + int64_t offset = (int64_t)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + int64_t i, j; + + int64_t outDim = gradWeight->size[0]; + int64_t inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 4, + "input must be in coo format, nnz x 3"); + + THTensor_(zero)(gradBias); + + int64_t nnz = THTensor_(size)(lastInput, 0); + +#pragma omp parallel for private(i, j) schedule(static) if ( \ + nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + if (THNN_(get2d)(lastInput, i, 2) == 0 ) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + int64_t stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. 
zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } +} + +void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + int64_t h, i, j; + + int64_t outDim = gradWeight->size[0]; + int64_t inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, + "input size must be batchsize x nnz x 2"); + + THTensor_(zero)(gradBias); + + int64_t batchSize = THTensor_(size)(lastInput, 0); + int64_t nnz = THTensor_(size)(lastInput, 1); + +#pragma omp parallel for private(h, i, j) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + int64_t stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } +} + +#undef ROW_PTR2 +#undef COL_PTR2 + +#endif diff --git a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c new file mode 100644 index 0000000..c81657f --- /dev/null +++ b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c @@ -0,0 +1,266 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeH*osizeW + oh*osizeW + ow; + + /* compute local average: */ + real sum = 0; + int ih, iw; + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + ih*istrideH + iw*istrideW); + sum += val; + } + } + + /* set output to local average */ + *op = sum / kW / kH; + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimH = 1; + int dimW = 2; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideH = 0; + int64_t istrideW 
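/*
  START_IND and END_IND above are the whole of "adaptive" pooling: an input extent of
  size in is split into out bins, with bin o covering
  [floor(o*in/out), ceil((o+1)*in/out)). Neighbouring bins can overlap by one element
  when in is not a multiple of out. A small sketch plus a worked example (the helper
  name is illustrative):

      #include <math.h>

      static void adaptive_bin(int o, int out, int in, int *start, int *end) {
          *start = (int)floorf((float)(o * in) / out);
          *end   = (int)ceilf((float)((o + 1) * in) / out);
      }

  For in = 5 and out = 3 this yields bins [0,2), [1,4) and [3,5), so every input
  element is covered and the middle bin is one element wider than the others.
*/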
= 0; + + real *input_data = nullptr; + real *output_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize4d)(output, sizeB, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + } +} + +static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeW*isizeH; + real *gradOutput_p_d = gradOutput_p + d*osizeW*osizeH; + + /* calculate average */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + real grad_delta = gradOutput_p_d[oh*osizeW +ow] / kH / kW; + + int ih, iw; + for(ih = istartH; ih < iendH; ih++) + { + for(iw = istartW; iw < iendW; iw++) + { + /* update gradient */ + gradInput_p_d[ih*isizeW + iw] += grad_delta; + } + } + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + int dimD = 0; + int dimH = 1; + int dimW = 2; + int64_t sizeB = 1; + int sizeD; + int isizeH; + int isizeW; + int osizeH; + int osizeW; + real *gradInput_data; + real *gradOutput_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + sizeB = input->size[0]; + dimD++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel 
for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeH*isizeW, gradOutput_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif + +#undef START_IND +#undef END_IND diff --git a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c new file mode 100644 index 0000000..711fa73 --- /dev/null +++ b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c @@ -0,0 +1,270 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeH*osizeW + oh*osizeW + ow; + THIndex_t *indp = ind_p + d*osizeH*osizeW + oh*osizeW + ow; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -FLT_MAX; + int ih, iw; + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + ih*istrideH + iw*istrideW); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = (ih+istartH)*isizeW + (iw+istartW); + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeW, + int osizeH) +{ + int dimW = 2; + int dimH = 1; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideD = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + int64_t istrideB = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + THIndex_t *indices_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimW++; + dimH++; + } + + /* sizes */ + sizeD = input->size[dimH-1]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimH-1]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, sizeD, osizeH, osizeW); + /* indices will contain i,j locations for each output point */ + THIndexTensor_(resize3d)(indices, sizeD, osizeH, 
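/*
  The max search in the frame kernel above uses (val > maxval) || isnan(val), so a
  NaN anywhere in a window wins and is propagated to the output together with its
  index. A standalone sketch of that comparison rule (nan_aware_argmax is an
  illustrative name, float data assumed):

      #include <float.h>
      #include <math.h>
      #include <stddef.h>

      static size_t nan_aware_argmax(const float *v, size_t n) {
          float best = -FLT_MAX;
          size_t best_i = 0;
          for (size_t i = 0; i < n; ++i)
              if (v[i] > best || isnan(v[i])) { best = v[i]; best_i = i; }
          return best_i;
      }

  The stored index is later offset by TH_INDEX_BASE, and the backward pass simply
  routes each output gradient back to that single recorded input location.
*/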
osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize4d)(output, sizeB, sizeD, osizeH, osizeW); + /* indices will contain i,j locations for each output point */ + THIndexTensor_(resize4d)(indices, sizeB, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeH*osizeW, + indices_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + } +} + +static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeH*isizeW; + real *gradOutput_p_d = gradOutput_p + d*osizeH*osizeW; + THIndex_t *ind_p_d = ind_p + d*osizeH*osizeW; + + /* calculate max points */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + for(ow = 0; ow < osizeW; ow++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_d[oh*osizeW + ow] - TH_INDEX_BASE; + + /* update gradient */ + gradInput_p_d[maxp] += gradOutput_p_d[oh*osizeW + ow]; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices) +{ + int dimW = 2; + int dimH = 1; + int64_t sizeB = 1; + int sizeD; + int isizeH; + int isizeW; + int osizeH; + int osizeW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + sizeB = input->size[0]; + dimW++; + dimH++; + } + + /* sizes */ + sizeD = input->size[dimH-1]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeH*isizeW, gradOutput_data+b*sizeD*osizeH*osizeW, + indices_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialAveragePooling.c b/aten/src/THNN/generic/SpatialAveragePooling.c new file mode 100644 index 0000000..2a057e4 --- /dev/null +++ b/aten/src/THNN/generic/SpatialAveragePooling.c @@ -0,0 
+1,329 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" +#else + +static inline void THNN_(SpatialAveragePooling_shapeCheck)( + THTensor *input, THTensor *gradOutput, + int kH, int kW, int dH, int dW, int padH, int padW, + bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight, outputWidth; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + int64_t nInputPlane; // number of channels (or colors) + + int64_t k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (input->dim() == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + int64_t xx, yy; + /* For all output pixels... */ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + int64_t i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = 0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... 
*/ + int64_t hstart = yy * dH - padH; + int64_t wstart = xx * dW - padW; + int64_t hend = fminf(hstart + kH, inputHeight + padH); + int64_t wend = fminf(wstart + kW, inputWidth + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + hend = fminf(hend, inputHeight); + wend = fminf(wend, inputWidth); + + real sum = 0; + + int divide_factor; + if(count_include_pad) + divide_factor = pool_size; + else + divide_factor = (hend - hstart) * (wend - wstart); + + int64_t kx, ky; + + for(ky = hstart; ky < hend; ky++) + { + for(kx = wstart; kx < wend; kx++) + sum += ptr_input[ky*inputWidth + kx]; + } + /* Update output */ + *ptr_output++ += sum/divide_factor; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + int64_t ndim = 3; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + int64_t nInputPlane; // number of channels (or colors) + + real *gradOutput_data; + real *gradInput_data; + + int64_t k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode); + + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + ndim = 4; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + + THTensor_(resizeAs)(gradInput, input); + + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous"); + + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + int64_t xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + + int64_t i; + for(i=0; i= 0 && cur_target < n_classes); + + real cur_weight = weights ? 
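/*
  The reduced branch of the spatial class-NLL loss accumulates, for every pixel, the
  negative log-probability of its target class scaled by an optional per-class
  weight, and tracks the total weight for the element-wise-mean division. A
  standalone sketch of that accumulation (illustrative names; the ignore_index
  handling of the real kernel is omitted):

      #include <stdint.h>

      // logp is laid out as [batch][class][pixel]; target as [batch][pixel].
      // class_w may be NULL for unweighted classes.
      static double spatial_nll(const float *logp, const int64_t *target,
                                const float *class_w, int64_t batch,
                                int64_t classes, int64_t pixels, int mean) {
          double loss = 0.0, total_w = 0.0;
          for (int64_t b = 0; b < batch; ++b)
              for (int64_t p = 0; p < pixels; ++p) {
                  int64_t t = target[b * pixels + p];
                  double cw = class_w ? class_w[t] : 1.0;
                  total_w += cw;
                  loss -= cw * (double)logp[(b * classes + t) * pixels + p];
              }
          return (mean && total_w > 0.0) ? loss / total_w : loss;
      }

  The backward pass below inverts this: gradInput at (b, target, h, w) receives
  -weight[target], divided by the same total weight when averaging, and every other
  entry stays zero.
*/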
weights_data[cur_target] : 1.0f; + total_weight_acc += cur_weight; + output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight; + } + } + *total_weight_data = total_weight_acc; + *output_data = output_acc; + + if (reduction == Reduction::ElementwiseMean && *total_weight_data) + *output_data /= *total_weight_data; + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + INITIAL_CHECK; + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, + "gradInput must be contiguous"); + THNN_CHECK_SHAPE(input, gradInput); + ignore_index -= TH_INDEX_BASE; + + if (reduction == Reduction::None) { + GRADOUTPUT_SHAPE_CHECK; + + int64_t batch_size = THTensor_(size)(input, 0); + int64_t H = THTensor_(size)(input, 2); + int64_t W = THTensor_(size)(input, 3); + + int64_t b, h, w; + #pragma omp parallel for private(b, h, w) + for (b = 0; b < batch_size; b++) { + for (h = 0; h < H; h++) { + for (w = 0; w < W; w++) { + int64_t cur_target = (int64_t)THIndexTensor_(get3d)(target, b, h, w) - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + real value = -(weights ? THTensor_(fastGet1d)(weights, cur_target) : 1.0f); + real gradOutput_value = THTensor_(fastGet3d)(gradOutput, b, h, w); + THTensor_(fastSet4d)(gradInput, b, cur_target, h, w, value * gradOutput_value); + } + } + } + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) + return; + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + int64_t batch_size = THTensor_(size)(input, 0); + int64_t n_classes = THTensor_(size)(input, 1); + int64_t map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + int64_t sample_size = map_size * n_classes; + + real normalize = (reduction == Reduction::ElementwiseMean) ? *total_weight_data : 1.0f; + + int b; + #pragma omp parallel for + for (b = 0; b < batch_size; b++) { + int elem; + for (elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; + if (cur_target == ignore_index) continue; + THAssert(cur_target >= 0 && cur_target < n_classes); + + int index = b * sample_size + cur_target * map_size + elem; + gradInput_data[index] = + -(weights ? 
weights_data[cur_target] : 1.0f) / normalize * THTensor_(fastGet1d)(gradOutput, 0); + } + } + + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +#undef INITIAL_CHECK + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c new file mode 100644 index 0000000..443901a --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -0,0 +1,366 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c" +#else + +static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, + int dW, int padH, int padW, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t nInputPlane = weight->size[2] / (kH * kW); + int64_t nOutputPlane = weight->size[1]; + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane); + THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight); + THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(view_weight_local)(THTensor *_weight) +{ + THTensor *weight = THTensor_(newContiguous)(_weight); + AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), + "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); + if (weight->dim() == 6) { + int64_t s1 = weight->size[0] * weight->size[1]; + int64_t s2 = weight->size[2]; + int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage3d)(weight->storage, + weight->storageOffset, + s1, -1, s2, -1, s3, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionLocal_updateOutput_frame) + ( + THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + THTensor *output3d, *finput3d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + THTensor_(copy)(output, bias); + + output3d = THTensor_(newWithStorage3d) + (output->storage, output->storageOffset, + outputHeight * outputWidth, 1, + nOutputPlane, outputHeight * outputWidth, + 1, nOutputPlane * outputHeight * outputWidth); + + finput3d = THTensor_(newWithStorage3d) + (finput->storage, finput->storageOffset, + outputHeight * outputWidth, 1, + kW * kH * nInputPlane, outputHeight * outputWidth, + 1, kW * kH * nInputPlane * outputHeight * outputWidth); + + // weight: oH*oW x nOutputPlane x 
nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THTensor_(free)(output3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + + int64_t nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH); + int64_t nOutputPlane = THTensor_(size)(weight, 1); + + if(input->dim() == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + + +static void THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (THTensor *gradInput, THTensor *gradOutput, + THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + THTensor *gradOutput3d, *fgradInput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + THTensor_(free)(gradOutput3d); + THTensor_(free)(fgradInput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + 
THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + int64_t nInputPlane = THTensor_(size)(weight,2)/(kW*kH); + int64_t nOutputPlane = THTensor_(size)(weight,1); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput, gradOutput, tweight, + fgradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + + THTensor *gradOutput3d, *finput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + + THTensor_(free)(gradOutput3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale_) +{ + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + 
THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + gradWeight = THNN_(view_weight_local)(gradWeight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int64_t nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH); + int64_t nOutputPlane = THTensor_(size)(gradWeight,1); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput, gradWeight, gradBias, finput, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput_t, gradWeight, gradBias, finput_t, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(gradWeight); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c new file mode 100644 index 0000000..cdbff69 --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -0,0 +1,413 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" +#else + +static inline void THNN_(SpatialConvolutionMM_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, int weight_nullable) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t exactInputHeight = inputHeight + 2 * padH; + int64_t exactInputWidth = inputWidth + 2 * padW; + + if (exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld). " + "Kernel size: (%ld x %ld). Kernel size can't be greater than actual input size", + exactInputHeight, exactInputWidth, kH, kW); + } + + int64_t outputHeight = (exactInputHeight - kH) / dH + 1; + int64_t outputWidth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). 
" + "Calculated output size per channel: (%ld x %ld). Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kH * kW); + } + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { + weight = THTensor_(newContiguous)(weight); + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t nOutputPlane, + int64_t outputWidth, + int64_t outputHeight) +{ + int64_t i; + THTensor *output2d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + if (bias) { + for(i = 0; i < nOutputPlane; i++) + THVector_(fill) + (THStorage_(data)(output->storage) + output->storageOffset + output->stride[0] * i, + THTensor_(get1d)(bias, i), outputHeight*outputWidth); + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(newViewWeightMM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0); + + input = THTensor_(newContiguous)(input); + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + int64_t nInputPlane = input->size[dimf]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t nOutputPlane = weight->size[0]; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + if(input->dim() == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, 
kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, + padW, padH, + gradInput->size[0], gradInput->size[2], gradInput->size[1], + gradOutput->size[2], gradOutput->size[1]); +} + +void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(newViewWeightMM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, + tweight, fgradInput, + kW, kH, dW, dH, padW, padH); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, + tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) +{ + int64_t i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + 
gradOutput->size[1]*gradOutput->size[2], -1); + + if (gradWeight) { + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + } + + if (gradBias) { + for(i = 0; i < gradBias->size[0]; i++) + { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // can be NULL if gradWeight = NULL + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + gradWeight = THNN_(newViewWeightMM2d)(gradWeight); + } + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, + gradBias, finput, scale); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = NULL; + if (gradWeight) { + finput_t = THTensor_(newSelect)(finput, 0, t); + } + + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, + gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + if (gradWeight) { + THTensor_(free)(finput_t); + } + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (gradWeight) { + THTensor_(free)(gradWeight); + } +} + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionMap.c b/aten/src/THNN/generic/SpatialConvolutionMap.c new file mode 100644 index 0000000..cdd74ed --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionMap.c @@ -0,0 +1,277 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c" +#else + +void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + + THArgCheck(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, "non-empty 3D or 4D(batch mode) tensor expected"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimc++; + dimw++; + dimh++; + } + + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size[dimw] >= kW && 
input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + + const int64_t input_w = input->size[dimw]; + const int64_t input_h = input->size[dimh]; + const int64_t output_w = (input_w - kW) / dW + 1; + const int64_t output_h = (input_h - kH) / dH + 1; + + if (input->dim() == 3) + THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); + else + THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + connTable = THTensor_(newContiguous)(connTable); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h; + int64_t j, k; + real z= bias_data[p]; + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = z; + + /* convolve all maps */ + int nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(validXCorr2Dptr)( + output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, + 1.0, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + weight_data + k*kW*kH, + kH, kW, + dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(output); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size[dimw]; + const int64_t output_h = gradOutput->size[dimh]; + const int64_t output_w = gradOutput->size[dimw]; + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + /* contiguous */ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + connTable = THTensor_(newContiguous)(connTable); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { 
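+    /* Each thread owns one input plane p. A connTable row k stores an
+       (input plane i, output plane o) pair (TH_INDEX_BASE-based); for every
+       connection with i == p, the full convolution of gradOutput plane o with
+       kernel k is accumulated into gradInput plane p, so no two threads write
+       to the same gradInput plane. */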
+ int64_t m; + for (m = 0; m < nbatch; m++) + { + int64_t k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(fullConv2Dptr)( + gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(gradInput); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size[dimw]; + const int64_t output_h = gradOutput->size[dimh]; + const int64_t output_w = gradOutput->size[dimw]; + const int64_t kH = gradWeight->size[1]; + const int64_t kW = gradWeight->size[2]; + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + + int64_t k; + /* gradients wrt bias */ +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h; + int64_t l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + } + + /* gradients wrt weight */ + const int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*kW*kH, + scale, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w, + dH, dW + ); + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c new file mode 100644 index 0000000..de4ddbd --- /dev/null +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ 
-0,0 +1,439 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c" +#else + +static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 4, 4, weight, + "non-empty 4D weight tensor (nOutputPlane, nInputPlane, kH, kW) expected, " + "but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (!THTensor_(isContiguous)(ones) || ones->dim() != 2 || + ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
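+    // (The ones buffer lets the bias term be broadcast over the spatial output
+    // with the single GEMM below: output_n = bias * ones^T, i.e. every output
+    // location of plane p starts out at bias[p].)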
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], + gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; 
elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 1); + + // Params + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], + gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, 
padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c new file mode 100644 index 0000000..2d595b7 --- /dev/null +++ b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c @@ -0,0 +1,401 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c" +#else + +static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( + THTensor *input, THTensor *gradOutput, THIndexTensor *indices, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationH > 0 && dilationW > 0, 12, + "dilation should be greater than zero, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight, outputWidth; + int64_t nOutputPlane = nInputPlane; + + if (ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts 
inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth); + } +} + +static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t nslices, + int64_t iwidth, + int64_t iheight, + int64_t owidth, + int64_t oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH + ) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + int64_t i, j; + real *ip = input_p + k*iwidth*iheight; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight); + int64_t wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth); + while(hstart < 0) + hstart += dilationH; + while(wstart < 0) + wstart += dilationW; + + /* local pointers */ + real *op = output_p + k*owidth*oheight + i*owidth + j; + THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t tcntr = 0; + int64_t x,y; + for(y = hstart; y < hend; y += dilationH) + { + for(x = wstart; x < wend; x += dilationW) + { + tcntr = y*iwidth + x; + real val = *(ip + tcntr); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + int64_t nInputPlane; + int64_t inputHeight; + int64_t inputWidth; + int64_t outputHeight; + int64_t outputWidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, NULL, NULL, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + if (ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = 
(int64_t)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data, output_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data+p*nInputPlane*inputWidth*inputHeight, + output_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nInputPlane; k++) + { + real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; + real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; + THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight; + + /* calculate max points */ + int64_t i, j; + for(i = 0; i < outputHeight; i++) + { + for(j = 0; j < outputWidth; j++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE; + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; + } + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + int nInputPlane; + int inputHeight; + int inputWidth; + int outputHeight; + int outputWidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, gradOutput, indices, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + /* get contiguous gradOutput */ + gradOutput = 
THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + outputHeight = gradOutput->size[dimh]; + outputWidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data, gradOutput_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + else + { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data+p*nInputPlane*inputWidth*inputHeight, + gradOutput_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFractionalMaxPooling.c b/aten/src/THNN/generic/SpatialFractionalMaxPooling.c new file mode 100644 index 0000000..c759872 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFractionalMaxPooling.c @@ -0,0 +1,253 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c" +#else + +static int64_t* THNN_(SpatialFractionalMaxPooling_generateIntervals)( + real sample, + int64_t inputSize, + int64_t outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + int64_t* sequence = (int64_t*) THAlloc(sizeof(int64_t) * outputSize); + + int64_t i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (int64_t) ((i + sample) * alpha) - (int64_t) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + int64_t numPlanes, + int64_t inputW, int64_t inputH, + int64_t outputW, int64_t outputH, + int poolSizeW, int poolSizeH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 2 random samples, one for W and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 2; + + /* Generate interval sequence */ + int64_t* sequenceW = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputW, outputW, poolSizeW); + int64_t* sequenceH = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputH, outputH, poolSizeH); + + /* loop over output */ + int64_t h, w; + + real* inputForPlane = input + plane * inputW * inputH; + real* outputForPlane = output + plane * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + int64_t inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + int64_t inputWStart = sequenceW[w]; + + real maxVal = -THInf; + int64_t maxIndex = -1; + + int64_t h2, w2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + 
THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + + int64_t planeIndex = h2 * inputW + w2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW + w] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE; + } + } + + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + int64_t numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputH + poolSizeH - 1 <= inputH, 7, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 <= inputW, 6, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 3) { + /* resize output */ + THTensor_(resize3d)(output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW); + + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } else { + THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW); + + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + THTensor_(data)(randomSamples) + batch * numPlanes * 2, + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + int64_t numPlanes, + int64_t inputW, int64_t inputH, + int64_t outputW, int64_t outputH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputW * outputH; + + int64_t h, w; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + int64_t 
outputIndex = h * outputW + w; + int64_t index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + int64_t numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 3) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputW, inputH, outputW, outputH); + } else { + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + numPlanes, inputW, inputH, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFullConvolution.c b/aten/src/THNN/generic/SpatialFullConvolution.c new file mode 100644 index 0000000..b9cd9fe --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullConvolution.c @@ -0,0 +1,59 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c" +#else + +void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); + } + +void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, gradColumns, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + +void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale_) +{ 
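+  /* SpatialFullConvolution is a thin wrapper around the dilated variant with
+     dilationW = dilationH = 1 (same pattern as updateOutput/updateGradInput above) */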
+THNN_(SpatialFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFullConvolutionMap.c b/aten/src/THNN/generic/SpatialFullConvolutionMap.c new file mode 100644 index 0000000..a6fe507 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullConvolutionMap.c @@ -0,0 +1,223 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c" +#else + +void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + // What does this mean? + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + const int kH = (int)weight->size[1]; + const int kW = (int)weight->size[2]; + + THArgCheck(input != NULL && !input->is_empty() && input->dim() == 3, 2, "non-empty 3D tensor expected"); + THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); + + THTensor_(resize3d)( + output_, nOutputPlane, + (input->size[1] - 1) * dH + kH, + (input->size[2] - 1) * dW + kW + ); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + THTensor* output = THTensor_(newContiguous)(output_); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = output->size[1]; + const int64_t output_w = output->size[2]; + const int64_t weight_h = weight->size[1]; + const int64_t weight_w = weight->size[2]; + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h; + int64_t j; + int nweight; + int64_t k; + + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = bias_data[p]; + + /* convolve all maps */ + nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(fullConv2Dptr)( + output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(freeCopyTo)(output, output_); +} + +void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", 
TH_INDEX_BASE + ); + + /* contiguous */ + THTensor* gradInput = THTensor_(newContiguous)(gradInput_); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = gradOutput->size[1]; + const int64_t output_w = gradOutput->size[2]; + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { + int64_t k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(validXCorr2Dptr)( + gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(freeCopyTo)(gradInput, gradInput_); + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = gradOutput->size[1]; + const int64_t output_w = gradOutput->size[2]; + const int64_t weight_h = gradWeight->size[1]; + const int64_t weight_w = gradWeight->size[2]; + + /* gradients wrt bias */ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; + int64_t l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + + /* gradients wrt weight */ + int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*weight_w*weight_h, + scale, + gradOutput_data + o*output_w*output_h, output_h, output_w, + input_data + i*input_w*input_h, input_h, input_w, + dH, dW + ); + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git 
a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c new file mode 100644 index 0000000..8c66d02 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -0,0 +1,454 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullDilatedConvolution.c" +#else + +static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + THArgCheck((adjW < dW || adjW < dilationW) && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller than either stride or dilation, but got adjH: %d adjW: %d dH: %d dW: %d dilationH: %d dilationW: %d", + adjH, adjW, dH, dW, dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 0); + + int nInputPlane = THTensor_(size)(weight,0); + int nOutputPlane = THTensor_(size)(weight,1); + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size[3]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
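+    // `ones` is later used as the k = 1 operand of the bias GEMM, which broadcasts
+    // each bias value over every output pixel of its channel.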
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(columns), + nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 0); + + int64_t nInputPlane = THTensor_(size)(weight,0); + int64_t nOutputPlane = THTensor_(size)(weight,1); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, 
nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradColumns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 1); + + int64_t nOutputPlane; + if (gradWeight) { + nOutputPlane = THTensor_(size)(gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THTensor_(size)(gradBias, 0); + } else { + return; + } + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
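+    // Here `ones` serves as the GEMV vector operand that sums gradOutput over all
+    // output pixels when accumulating gradBias below.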
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, input->size[1], inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c new file mode 100644 index 0000000..d31f3e0 --- /dev/null +++ b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c @@ -0,0 +1,250 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, + "non-empty 4D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int oheight = THTensor_(size)(grid, 1); + int owidth = THTensor_(size)(grid, 2); + + THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); + } +} + +#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ + && y < H ? THTensor_(fastGet4d)(input, n, c, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + // resize output to the same shape as input + THTensor_(resize4d)(output, N, C, H, W); + + // loop over each output pixel + int n, h, w, c; +#pragma omp parallel for private(n, h, w, c) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); + real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + THTensor_(fastSet4d)(output, n, c, h, w, out_val); + } + } + 
} + } +} + +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H) { \ + real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ + THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + THTensor_(resize4d)(gradInput, N, C, IH, IW); + THTensor_(resize4d)(gradGrid, N, H, W, 2); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, h, w; +#pragma omp parallel for private(n, h, w) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + real gix = 0; + real giy = 0; + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); + + gix -= nw_val * (iy_se - iy) * gradout; + gix += ne_val * (iy_sw - iy) * gradout; + gix -= sw_val * (iy - iy_ne) * gradout; + gix += se_val * (iy - iy_nw) * gradout; + + giy -= nw_val * (ix_se - ix) * gradout; + giy -= ne_val * (ix - ix_sw) * gradout; + giy += sw_val * (ix_ne - ix) * 
gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); + real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); + + THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); + THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); + } + } + } +} + + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/SpatialMaxPooling.c b/aten/src/THNN/generic/SpatialMaxPooling.c new file mode 100644 index 0000000..88aaa40 --- /dev/null +++ b/aten/src/THNN/generic/SpatialMaxPooling.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c" +#else + +void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c new file mode 100644 index 0000000..64179b5 --- /dev/null +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -0,0 +1,234 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c" +#else + +static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; + int has_error = 0; + THIndex_t error_index = 0; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k*owidth*oheight; + real *input_p_k = input_p + k*iwidth*iheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp<0 || maxp>=owidth*oheight){ +#pragma omp critical + { + has_error = 1; + error_index = maxp; + } + } else { + output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */ + } + } + } + } + if (has_error) { + THError("found an invalid max index %ld (output volumes are of size %dx%d)", + error_index, oheight, owidth); + } +} + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + AT_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input, but got sizes: ", input->sizes()); + THNN_CHECK_SHAPE_INDICES(input, indices); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + 
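+  /* each entry of `indices` is the flattened position, within an oheight x owidth
+     output plane, where the corresponding input value is scattered */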
+ /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + /* get contiguous input and indices */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp < 0 || maxp >= owidth * oheight) { + THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight); + } + gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_CHECK_SHAPE_INDICES(input, indices); + + /* get contiguous gradOutput and indices */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. 
oheight= %d, owidth= %d, gradOutput: %dx%d", + oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]); + } + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialReflectionPadding.c b/aten/src/THNN/generic/SpatialReflectionPadding.c new file mode 100644 index 0000000..4ccdca8 --- /dev/null +++ b/aten/src/THNN/generic/SpatialReflectionPadding.c @@ -0,0 +1,272 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c" +#else + +static void THNN_(SpatialReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* input sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + + THArgCheck(pad_t < iheight && pad_b < iheight, 6, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_t, pad_b, dimh, 
_THSizeDesc(input->size, input->dim()).str); + + /* output sizes */ + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 3) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialReplicationPadding.c b/aten/src/THNN/generic/SpatialReplicationPadding.c new file mode 100644 index 0000000..32c125d --- /dev/null +++ b/aten/src/THNN/generic/SpatialReplicationPadding.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c" +#else + +static void THNN_(SpatialReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." 
+ " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 3) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/aten/src/THNN/generic/SpatialSubSampling.c b/aten/src/THNN/generic/SpatialSubSampling.c new file mode 100644 index 0000000..8f9f95d --- /dev/null +++ b/aten/src/THNN/generic/SpatialSubSampling.c @@ -0,0 +1,299 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialSubSampling.c" +#else + +static inline void THNN_(SpatialSubSampling_shapeCheck)( + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + int kW, int kH) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "3D or 4D input tensor expected but got: %s"); + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + + int nInputPlane = THTensor_(size)(weight, 0); + + int dimw = 2; + int dimh = 1; + + int64_t inputWidth; + int64_t inputHeight; + + if (input->dim() == 4) { + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + + THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); +} + +void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH) +{ + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + int64_t k; + + THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + if (input->dim() == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + int64_t xx, yy; + /* For all output pixels... 
*/ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + /* Get the good mask for (k,i) (k out, i in) */ + real the_weight = weight_data[k]; + /* Initialize to the bias */ + real z = bias_data[k]; + int64_t i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = z; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... */ + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real sum = 0; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += inputWidth; /* next input line */ + } + /* Update output */ + *ptr_output++ += the_weight*sum; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH); + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + real *weight_data; + real *gradOutput_data; + real *gradInput_data; + + int64_t k; + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + weight_data = THTensor_(data)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real the_weight = weight_data[k]; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + int64_t xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + int64_t i; + for(i=0; i<inputWidth*inputHeight; i++) + ptr_gi[i] = 0.0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real z = *ptr_gradOutput++ * the_weight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + ptr_gradInput[kx] += z; + ptr_gradInput += inputWidth; + } + } + } + } + } + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH); + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(gradWeight,0); + + real *gradWeight_data; + real *gradBias_data; + real *gradOutput_data; + real *input_data; + + int64_t k; + + if (input->dim() == 4) { + dimw++; + dimh++; + nbatch = input->size[0]; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + gradWeight_data = THTensor_(data)(gradWeight); + gradBias_data = THTensor_(data)(gradBias); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + real sum; + int64_t xx, yy; + int64_t i; + + sum = 0; + for(i = 0; i < outputWidth*outputHeight; i++) + sum += ptr_gradOutput[i]; + gradBias_data[k] += scale*sum; + + sum = 0; + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real z = *ptr_gradOutput++; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; 
kx++) + sum += z * ptr_input[kx]; + ptr_input += inputWidth; + } + } + } + gradWeight_data[k] += scale*sum; + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c new file mode 100644 index 0000000..1998d3b --- /dev/null +++ b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c @@ -0,0 +1,180 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } +} + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(input); + return; + } + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. 
- h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * inputWidth] + + w1lambda * pos1[h1p * inputWidth + w1p]); + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners){ + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += h0lambda * w0lambda * pos2[0]; + pos1[w1p] += h0lambda * w1lambda * pos2[0]; + pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0]; + pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingNearest.c b/aten/src/THNN/generic/SpatialUpSamplingNearest.c new file mode 100644 index 0000000..92eaddd --- /dev/null +++ b/aten/src/THNN/generic/SpatialUpSamplingNearest.c @@ -0,0 +1,154 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 4, 2, input, + "4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } +} + + +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + const float height_scale = (float) inputHeight / (float) outputHeight; + const float width_scale = (float) inputWidth / (float) outputWidth; + + THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, + inputHeight, inputWidth, outputHeight, outputWidth); + + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, + outputWidth); + channels = channels * nbatch; + + THAssert(inputWidth > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(input); + return; + } + + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; 
c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth) +{ + THNN_(SpatialUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, + inputHeight, inputWidth, outputHeight, outputWidth); + THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *idata = THTensor_(data)(gradInput); + real *odata = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &idata[h1 * inputWidth + w1]; + const real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] = pos2[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + real* pos1 = &idata[h1 * inputWidth + w1]; + const real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/Sqrt.c b/aten/src/THNN/generic/Sqrt.c new file mode 100644 index 0000000..5e75c7d --- /dev/null +++ b/aten/src/THNN/generic/Sqrt.c @@ -0,0 +1,51 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sqrt.c" +#else + +void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps_) +{ + THTensor_(resizeAs)(output, input); + THTensor_(sqrt)(output, input); +} + +void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (output->_dim() == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = (*output_data == 0.0) ? 
0.0 : (0.5 * (*gradOutput_data / *output_data)); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *output_data = THTensor_(data)(output); + int64_t i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(output); i++) + { + if (output_data[i] == 0.0) + gradInput_data[i] = 0.0; + else + gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Square.c b/aten/src/THNN/generic/Square.c new file mode 100644 index 0000000..fac8ee3 --- /dev/null +++ b/aten/src/THNN/generic/Square.c @@ -0,0 +1,59 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Square.c" +#else + +void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + + if (input->_dim() == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data) * (*input_data); + ); + } + else + { + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + int64_t i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(input); i++) + output_data[i] = input_data[i]*input_data[i]; + } +} + +void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_SHAPE(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (input->_dim() == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *input_data = THTensor_(data)(input); + int64_t i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i]; + } +} + +#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h new file mode 100644 index 0000000..455da04 --- /dev/null +++ b/aten/src/THNN/generic/THNN.h @@ -0,0 +1,1721 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THNN.h" +#else + +#include "Reduction.h" + +TH_API void THNN_(Abs_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] Abs output +TH_API void THNN_(Abs_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput); // [OUT] gradient w.r.t. input + +TH_API void THNN_(AbsCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction); +TH_API void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + int64_t reduction); + +TH_API void THNN_(BCECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights); // [OPTIONAL] +TH_API void THNN_(BCECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights); // [OPTIONAL] + +TH_API void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) +TH_API void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. input + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. input + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(ELU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] ELU output + accreal alpha, // an ELU parameter (as in paper) + accreal scale, // scaling factor + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) +TH_API void THNN_(ELU_updateGradInput)( + THNNState *state, // library's state + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *output, // output from a forward pass + accreal alpha, // an ELU parameter (as in paper) + accreal scale); + +TH_API void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *output, // [OUT] a one-element tensor containing the loss + int64_t reduction); +TH_API void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *gradOutput, // grad output tensor + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + int64_t reduction); + +TH_API void THNN_(GatedLinear_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor, half size of input along dimension dim + int dim); // dimension for halving operation +TH_API void THNN_(GatedLinear_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t module's output + THTensor *gradInput, // [OUT] gradient w.r.t input + int dim); // dimension for halving operation + +// HardTanh clamps the values to the interval [min_val; max_val]. +TH_API void THNN_(HardTanh_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); +TH_API void THNN_(HardTanh_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); + +TH_API void THNN_(Im2Col_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Im2Col_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Col2Im_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Col2Im_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(L1Cost_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor +TH_API void THNN_(L1Cost_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [OPTIONAL] gradient w.r.t module's output + THTensor *gradInput); // [OUT] gradient w.r.t the input + +TH_API void THNN_(LeakyReLU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // [MODIFIED] input tensor + THTensor *output, // [OUT] output tensor + accreal negval, // negative part slope + bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated) +TH_API void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [MODIFIED] gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. 
the input + accreal negval, // negative part slope + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) + +TH_API void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *hx, + THTensor *output, + THTensor *storage); +TH_API void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage); + +TH_API void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *cell, + THTensor *output, + THTensor *outputCell); +TH_API void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *cx, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx); + +TH_API void THNN_(LogSigmoid_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // output tensor + THTensor *buffer); // [BUFFER] +TH_API void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *buffer); // [BUFFER] + +TH_API void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, // [OPTIONAL] + THIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + accreal scale); + +TH_API void THNN_(LookupTable_renorm)( + THNNState *state, // library's state + THIndexTensor *idx, // vector containing row indices (modified in function) + THTensor *weight, // 2D tensor whose rows will be renormalized + accreal maxNorm, // maximum norm + accreal normType); // the norm type (e.g., normType=2, then it's 2-norm) + +TH_API void THNN_(MarginCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contain only 1s and -1s) + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage, // if true, the loss is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contin only 1s and -1s) + THTensor *gradInput, // [OUT] gradient w.r.t. 
module's input + bool sizeAverage, // if true, the gradient is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction); + +TH_API void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction); +TH_API void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + int64_t reduction); +TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *isTarget, + int64_t reduction); + +TH_API void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + int p, + THTensor* weights, // [OPTIONAL] + accreal margin); +TH_API void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + int p, + THTensor *weights, // [OPTIONAL] + accreal margin); + +TH_API void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight); +TH_API void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + accreal scale); + +TH_API void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer); +TH_API void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale); + +TH_API void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace, + THGenerator *generator); +TH_API void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace); + +TH_API void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor 
*output, + int64_t reduction); +TH_API void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta, + accreal threshold); +TH_API void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta, + accreal threshold); + +TH_API void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda); +TH_API void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda); + + +TH_API void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train); +TH_API void THNN_(IndexLinear_accGradParameters)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THTensor* valuesBuffer, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_updateParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THIndexTensor *runningKeys, + THIndexTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate); + +TH_API void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); +TH_API void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); + 
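+/* Usage sketch (illustrative only; it assumes the standard TH float tensor
+   API and that the CPU library ignores its THNNState argument, so NULL may
+   be passed for it). THNN_(NAME) is expanded once per scalar type, so each
+   declaration in this header yields concrete symbols such as
+   THNN_FloatSqrt_updateOutput and THNN_DoubleSqrt_updateOutput. A minimal
+   caller of the Sqrt forward pass declared below could look like:
+
+     THFloatTensor *in  = THFloatTensor_newWithSize2d(4, 4);
+     THFloatTensor *out = THFloatTensor_new();
+     THFloatTensor_fill(in, 2.0f);
+     THNN_FloatSqrt_updateOutput(NULL, in, out, 0);  // out[i] = sqrt(in[i]); eps is unused by the CPU kernel
+     THFloatTensor_free(in);
+     THFloatTensor_free(out);
+*/
+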
+TH_API void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps); +TH_API void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold, + accreal val, + bool inplace); +TH_API void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold, + accreal val, + bool inplace); + +TH_API void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); +TH_API void THNN_(TemporalConvolution_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); +TH_API void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize); +TH_API void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); + +TH_API void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale); + +TH_API void THNN_(TemporalUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW); +TH_API void 
THNN_(TemporalUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeW, + int osizeW); + +TH_API void THNN_(TemporalUpSamplingLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, + bool align_corners); +TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeW, + int osizeW, + bool align_corners); + +TH_API void THNN_(BatchNormalization_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, // [OPTIONAL] + THTensor *bias, // [OPTIONAL] + THTensor *running_mean, // [OPTIONAL] if train + THTensor *running_var, // [OPTIONAL] if train + THTensor *save_mean, + THTensor *save_std, + bool train, + double momentum, + double eps); +TH_API void THNN_(BatchNormalization_backward)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, // [OPTIONAL] + THTensor *gradWeight, // [OPTIONAL] + THTensor *gradBias, // [OPTIONAL] + THTensor *weight, // [OPTIONAL] + THTensor *running_mean, // [OPTIONAL] if train + THTensor *running_var, // [OPTIONAL] if train + THTensor *save_mean, // [OPTIONAL] if !train + THTensor *save_std, // [OPTIONAL] if !train + bool train, + double scale, + double eps); + +TH_API void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +TH_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeW, int osizeH); +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices); + +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, int osizeH); +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); +TH_API void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int kW, 
int kH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int kW, int kH, + THIndexTensor *indices); + +TH_API void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale); + +TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale); + +TH_API void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale); + +TH_API void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); +TH_API void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); +TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight); +TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, 
+ THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale); + +TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeH, + int osizeW); + +TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeH, + int isizeW, + int osizeH, + int osizeW); + +TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeH, + int osizeW, + bool align_corners); +TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeH, + int isizeW, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int osizeW, int outputHeight); +TH_API void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); + +TH_API void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, bool count_include_pad); +TH_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, bool count_include_pad); + +// VolumetricConvolution is legacy and purposefully not bound by ATen +TH_API void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void 
THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, // HACK to make signature line up with backward + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices); + +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, // [OUT] volumetric convolution output + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // [OUT] internal columns buffer + THTensor *fgradInput, // [OUT] internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + accreal scale); // scaling factor + +TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale); + +TH_API void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, // [OUT] volumetric convolution output + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // [OUT] internal columns buffer + THTensor *fgradInput, // [OUT] internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH); // extra output adjustment + +TH_API void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, // extra output adjustment + accreal scale); // scaling factor + +TH_API void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode); +TH_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode); + +TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); +TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); + +TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeW, + int osizeH); +TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeT, int osizeW, int osizeH); +TH_API void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices); + +TH_API void THNN_(SpatialReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int 
pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(FeatureLPPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal power, + int width, + int stride, + bool batchMode); + +TH_API void THNN_(FeatureLPPooling_updateGradInput)( + THNNState *state, + THTensor* gradOutput, + THTensor* input, + THTensor* output, + THTensor* gradInput, + accreal power, + int width, + int stride, + bool batchMode); + +TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom, + int pad_front, int pad_back); + +TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right, + int pad_top, int pad_bottom, + int pad_front, int pad_back); + +TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeH, + int osizeW); + +TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeT, + int isizeH, + int isizeW, + int osizeT, + int osizeH, + int osizeW); + +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeT, + int isizeH, + int isizeW, + int osizeT, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(TemporalReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right); + +#endif diff --git a/aten/src/THNN/generic/Tanh.c b/aten/src/THNN/generic/Tanh.c new file mode 100644 index 0000000..898656b --- /dev/null +++ b/aten/src/THNN/generic/Tanh.c @@ -0,0 +1,48 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Tanh.c" +#else + +void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(tanh)(output, input); +} + +void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + if (output->_dim() == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. 
- z*z); + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_output = THTensor_(data)(output); + int64_t i; + +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + { + real z = ptr_output[i]; + ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c new file mode 100644 index 0000000..a7fdd3f --- /dev/null +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -0,0 +1,392 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalConvolution.c" +#else + +static inline void THNN_(TemporalConvolution_shapeCheck)( + THNNState *state, + THTensor *input, + int kW, + int dW, + int *inputFrameSize) { + + THArgCheck(kW > 0, 9, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 11, + "stride should be greater than zero, but got dW: %d", dW); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck(input->size[dimF] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[dimF], *inputFrameSize); + } + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. Got: %d, Expected: %d", + input->size[dimS], kW); +} + +void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize, + int outputFrameSize) +{ + THTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (input->dim() == 3) + { + dimS = 1; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, &inputFrameSize); + input = THTensor_(newContiguous)(input); + outputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->dim() == 2) + { + THTensor_(resize2d)(output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, output, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + else + { + THTensor *outputSample 
= THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + THTensor_(resize3d)(output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(outputSample, output, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, outputSample, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + THTensor_(free)(outputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(outputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + int64_t nInputFrame; + int64_t nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *gradInputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + gradOutputWindow = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (gradOutput->dim() == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *gradInputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(gradInputSample, gradInput, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; 
nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(gradInputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(gradInputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t nInputFrame; + int64_t nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *inputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + if (input->dim() == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutput, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = 
(nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +#endif diff --git a/aten/src/THNN/generic/TemporalMaxPooling.c b/aten/src/THNN/generic/TemporalMaxPooling.c new file mode 100644 index 0000000..faef305 --- /dev/null +++ b/aten/src/THNN/generic/TemporalMaxPooling.c @@ -0,0 +1,283 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c" +#else + +static inline void THNN_(TemporalMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kW, + int dW) { + int64_t niframe; + int64_t framesize; + int64_t noframe; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + int ndims = input->dim(); + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. 
Got: %d, Expected: %d", + input->size[dimS], kW); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe); + THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize) + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize); + } +} + +void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int dW) +{ + int64_t niframe; + int64_t framesize; + int64_t noframe; + + real *input_data; + real *output_data; + THIndex_t *indices_data; + + int64_t t, y; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW); + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + + /* sizes */ + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 2) + { + /* resize output */ + THTensor_(resize2d)(output, noframe, framesize); + + /* indices will contain index locations for each output point */ + THIndexTensor_(resize2d)(indices, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for(t = 0; t < noframe; t++) + { + real *ip = input_data + t*framesize*dW; + real *op = output_data + t*framesize; + THIndex_t *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + else + { + /* number of batch frames */ + int64_t nbframe = input->size[0]; + int64_t i; + + /* resize output */ + THTensor_(resize3d)(output, nbframe, noframe, framesize); + + /* indices will contain index locations for each output point */ + THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for(i = 0; i < nbframe; i++) + { + real *inputSample_data = input_data + i*niframe*framesize; + real *outputSample_data = output_data + i*noframe*framesize; + THIndex_t *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *ip = inputSample_data + t*framesize*dW; + real *op = outputSample_data + t*framesize; + THIndex_t *xp = indicesSample_data + t*framesize; + +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + } + + /* cleanup */ + THTensor_(free)(input); + +} + +void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int dW) +{ + int64_t niframe; + int noframe; + int64_t framesize; + + real 
*gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + int64_t t, y; + + THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW); + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize and zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + /* sizes */ + niframe = input->size[dimS]; + noframe = gradOutput->size[dimS]; + framesize = gradOutput->size[dimF]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + if (input->dim() == 2) + { + for(t = 0; t < noframe; t++) + { + real *gip = gradInput_data + t*framesize*dW; + real *gop = gradOutput_data + t*framesize; + THIndex_t *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = (int64_t)xp[y]; + if (maxindex != -1) + gip[maxindex*framesize+y] += gop[y]; + } + } + } + else + { + /* number of batch frames */ + int64_t nbframe = input->size[0]; + int64_t i; + + for(i = 0; i < nbframe; i++) + { + real *gradInputSample_data = gradInput_data + i*niframe*framesize; + real *gradOutputSample_data = gradOutput_data + i*noframe*framesize; + THIndex_t *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *gip = gradInputSample_data + t*framesize*dW; + real *gop = gradOutputSample_data + t*framesize; + THIndex_t *xp = indicesSample_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = (int64_t)xp[y]; + if (maxindex != -1) + gip[maxindex*framesize+y] += gop[y]; + } + } + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalReflectionPadding.c b/aten/src/THNN/generic/TemporalReflectionPadding.c new file mode 100644 index 0000000..ea6ea9a --- /dev/null +++ b/aten/src/THNN/generic/TemporalReflectionPadding.c @@ -0,0 +1,219 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalReflectionPadding.c" +#else + +static void THNN_(TemporalReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + /* real *dest_p = output_p + k*owidth*oheight + i * owidth + j; */ + real *dest_p = output_p + k*owidth + j; + real *src_p = input_p + k*iwidth + ip_x; + *dest_p = *src_p; + } + } +} + +void THNN_(TemporalReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: 
%s"); + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* input size */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + + THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + + /* output size */ + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 , 2, + "input (W: %d)is too small." + " Calculated output W: %d", + iwidth, owidth); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 2) + { + THTensor_(resize2d)(output, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(TemporalReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + else + { + long p; + + THTensor_(resize3d)(output, nbatch, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(TemporalReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth, + output_data+p*nslices*owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(TemporalReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + real *src_p = goutput_p + k*owidth + j; + real *dest_p = ginput_p + k*iwidth + ip_x; + *dest_p += *src_p; + } + } +} + +void THNN_(TemporalReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 2) { + THNN_(TemporalReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, + owidth, + pad_l, pad_r); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(TemporalReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iwidth, + THTensor_(data)(gradOutput) + p * nslices * owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalReplicationPadding.c b/aten/src/THNN/generic/TemporalReplicationPadding.c new file mode 100644 index 0000000..da8aeb5 --- /dev/null +++ b/aten/src/THNN/generic/TemporalReplicationPadding.c @@ -0,0 +1,211 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalReplicationPadding.c" +#else + +static void THNN_(TemporalReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + real *dest_p = output_p + k*owidth + j; + real *src_p = input_p + k*iwidth + ip_x; + *dest_p = *src_p; + } + } +} + +void THNN_(TemporalReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 , 2, + "input (W: %d)is too small." 
+ " Calculated output W: %d", + iwidth, owidth); + + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 2) + { + THTensor_(resize2d)(output, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(TemporalReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + else + { + long p; + + THTensor_(resize3d)(output, nbatch, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(TemporalReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth, + output_data+p*nslices*owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(TemporalReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + real *src_p = goutput_p + k*owidth + j; + real *dest_p = ginput_p + k*iwidth + ip_x; + *dest_p += *src_p; + } + } +} + +void THNN_(TemporalReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 2) { + THNN_(TemporalReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, + owidth, + pad_l, pad_r); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(TemporalReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iwidth, + THTensor_(data)(gradOutput) + p * nslices * owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c new file mode 100644 index 0000000..db3278b --- /dev/null +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -0,0 +1,468 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c" +#else + +static inline void THNN_(TemporalRowConvolution_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int padW) { + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 3, 3, weight, + "non-empty 3D weight tensor expected, but got: %s"); + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + // we're always looking at (possibly batch) x feats x seq + int ndim = input->dim(); + int dimF = 0; + int dimS = 1; + + if (ndim == 3) { + ++dimS; + ++dimF; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, + "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[dimS]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (nOutputFrame < 1) { + THError("Given input size: (%d x %d). " + "Calculated output size: (%d x %d). 
Output size is too small", + inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame); + } +} + +static void THNN_(unfolded_acc_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t c; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(c) + for (c = 0; c < inputFrameSize; c++) { + int64_t kw, x; + int64_t ix = 0; + + for (kw = 0; kw < kW; kw++) { + real *src = finput_data + + c * (kW * nOutputFrame) + + kw * (nOutputFrame); + real *dst = input_data + c * (nInputFrame); + + ix = (size_t)(kw); + if (dW == 1) { + real *dst_slice = dst + (size_t)(ix); + THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame); + } else { + for (x = 0; x < nOutputFrame; x++) { + real *dst_slice = dst + (size_t)(ix + x * dW); + THVector_(cadd)(dst_slice, dst_slice, + src + (size_t)(x), 1, 1); + } + } + } + } +} + +static void THNN_(unfolded_copy_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(k) + for (k = 0; k < inputFrameSize * kW; k++) { + int64_t c = k / kW; + int64_t rest = k % kW; + int64_t kw = rest % kW; + int64_t x; + int64_t ix; + real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame); + real *src = input_data + c * (nInputFrame); + + ix = (size_t)(kw); + if (dW == 1) { + memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame)); + } else { + for (x = 0; x < nOutputFrame; x++) { + memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW), + sizeof(real) * 1); + } + } + } +} + +static void THNN_(TemporalRowConvolution_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t i; + + THTensor *output3d = THTensor_(newWithStorage3d)( + output->storage, output->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + THNN_(unfolded_copy_row)(finput, input, kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(zero)(output); + + if (bias != NULL) { + for (i = 0; i < inputFrameSize; i++) + THVector_(fill) + (THStorage_(data)(output->storage) + output->storageOffset + + output->stride[0] * i, + THTensor_(get1d)(bias, i), nOutputFrame); + } + + THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput); + + THTensor_(free)(output3d); +} + +void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, // unused here but needed for Cuda + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->dim(); + + THTensor *tinput; + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + input = THTensor_(newContiguous)(tinput); + } else { + input = THTensor_(newContiguous)(input); + } + + THNN_(TemporalRowConvolution_shapeCheck)( + state, input, NULL, weight, bias, kW, dW, 
padW); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[ndim - 1]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (ndim == 2) { /* non-batch mode */ + + THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame); + THTensor_(resize2d)(output, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input, output, weight, bias, finput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + } else { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame); + THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + if (!featFirst) { // NOTE: output will NOT be contiguous in this case + THTensor_(transpose)(output, output, ndim - 1, ndim - 2); + THTensor_(free)(tinput); + } + + THTensor_(free)(input); +} + +static void THNN_(TemporalRowConvolution_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + // weight: inputFrameSize x kW x 1 + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d); + // fgradInput: inputFrameSize x kW x nOutputFrame + THTensor_(free)(gradOutput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_row)(fgradInput, gradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); +} + +void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->dim(); + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, + NULL, kW, dW, padW); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[ndim - 1]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + THTensor_(resizeAs)(fgradInput, finput); + THTensor_(resizeAs)(gradInput, input); + + THTensor_(zero)(fgradInput); + THTensor_(zero)(gradInput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if (ndim == 2) { + 
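/* descriptive note: non-batch case, where the (featFirst) input is a single inputFrameSize x nInputFrame matrix, so one frame update with the transposed weight (inputFrameSize x kW x 1) computes the whole gradient */ +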
THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput, gradOutput, tweight, fgradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + } else { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + + if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case + + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + + THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + +} + +static void THNN_(TemporalRowConvolution_accGradParameters_frame)( + THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale) { + + int64_t i; + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + 1, -1, + gradOutput->size[1], -1); + + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 1, 2); + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + // finput: inputFrameSize x nOutputFrame x kW + THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput); + // gradWeight: inputFrameSize x 1 x kW + THTensor_(free)(tfinput); + + if (gradBias != NULL) { + for (i = 0; i < gradBias->size[0]; i++) { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput3d->storage) + + gradOutput3d->storageOffset + + i * gradOutput3d->stride[0]; + for (k = 0; k < gradOutput3d->size[2]; k++) { + sum += data[k]; + } + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] + += scale * sum; + } + } + + THTensor_(free)(gradOutput3d); + +} + +void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale_) { + + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int ndim = input->dim(); + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW); + + if (ndim == 2) { + THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput, gradWeight, gradBias, finput, scale); + } else { + int64_t T = input->size[0]; + int64_t t; + + for (t = 0; t < T; t++) { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + 
THTensor_(free)(finput_t); + } + } + + if (!featFirst) { + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalSubSampling.c b/aten/src/THNN/generic/TemporalSubSampling.c new file mode 100644 index 0000000..8c90d26 --- /dev/null +++ b/aten/src/THNN/generic/TemporalSubSampling.c @@ -0,0 +1,156 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalSubSampling.c" +#else + +static inline void THNN_(TemporalSubSampling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kW, + int dW, + int *inputFrameSize) { + int nInputFrame, nOutputFrame; + + THArgCheck(kW > 0, 6, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 7, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(!input->is_empty() && input->dim() == 2, 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck( input->size[1] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[1], *inputFrameSize); + } + THArgCheck( input->size[0] >= kW, 2, + "input sequence smaller than kernel size. Got %d, Expected: %d", + input->size[0], kW); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), 0, nOutputFrame); + if (inputFrameSize != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), 1, *inputFrameSize); + } + } +} + +void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize) +{ + THTensor *outputFrame, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 4, "bias must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize); + + outputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + THTensor_(resize2d)(output, + nOutputFrame, + inputFrameSize); + + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(outputFrame, output, 0, k); + THTensor_(sum)(outputFrame, inputWindow, 0, 1); + THTensor_(cmul)(outputFrame, outputFrame, weight); + THTensor_(cadd)(outputFrame, outputFrame, 1, bias); + } + + THTensor_(free)(outputFrame); + THTensor_(free)(inputWindow); +} + +void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + + THTensor *gradOutputFrame; + THTensor *gradInputWindow, *buffer, *kwunit; + int64_t k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + + gradOutputFrame = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + kwunit = THTensor_(newWithSize1d)(kW); + + THTensor_(fill)(kwunit, 1); + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); + 
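/* descriptive note: each of the kW input frames in this window receives weight .* gradOutput[k]; the addr() call below adds exactly that, as an outer product of the all-ones vector kwunit with the scaled gradient */ +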
THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(cmul)(buffer, weight, gradOutputFrame); + THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(gradInputWindow); + THTensor_(free)(buffer); + THTensor_(free)(kwunit); +} + +void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THTensor *gradOutputFrame; + THTensor *inputWindow, *buffer; + int64_t k; + + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + gradOutputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(sum)(buffer, inputWindow, 0, 1); + THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(inputWindow); + THTensor_(free)(buffer); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingLinear.c b/aten/src/THNN/generic/TemporalUpSamplingLinear.c new file mode 100644 index 0000000..2faa9f8 --- /dev/null +++ b/aten/src/THNN/generic/TemporalUpSamplingLinear.c @@ -0,0 +1,147 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalUpSamplingLinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 3, 2, input, + "non-empty 3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputWidth = THTensor_(size)(input, 2); + + THNN_(TemporalUpSamplingLinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputWidth, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize3d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputWidth > 0 && outputWidth > 0); + // special case: just copy + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); + return; + } + const accreal 
rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[w1]; + // index w2 is interpolated by idata[w1] and (itself or idata[w1 + 1]) + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = w0lambda * pos1[0] + w1lambda * pos1[w1p]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); +} + +void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputWidth, + int outputWidth, + bool align_corners){ + + THNN_(TemporalUpSamplingLinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputWidth, + outputWidth); + + THTensor_(resize3d)(gradInput, nbatch, channels, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += w0lambda * pos2[0]; + pos1[w1p] += w1lambda * pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingNearest.c b/aten/src/THNN/generic/TemporalUpSamplingNearest.c new file mode 100644 index 0000000..853f6ca --- /dev/null +++ b/aten/src/THNN/generic/TemporalUpSamplingNearest.c @@ -0,0 +1,130 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 3, 2, input, + "3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputWidth = THTensor_(size)(input, 2); + const float scale = (float) inputWidth / (float)outputWidth; + + THNN_(TemporalUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, inputWidth, outputWidth); + + THTensor_(resize3d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputWidth); + channels = channels * nbatch; + + THAssert(inputWidth > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); + return; + } + + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal src_x = nearest_neighbor_compute_source_index(scale, w2, inputWidth); + const int w1 = src_x; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); +} + +void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputWidth, + int outputWidth) +{ + THNN_(TemporalUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, inputWidth, outputWidth); + THTensor_(resize3d)(gradInput, nbatch, channels, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float scale = (float) inputWidth / (float)outputWidth; + + // special case: same-size matching grids + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + 
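/* descriptive note: with equal input and output widths, nearest-neighbour upsampling is the identity, so each output gradient is accumulated straight into the matching input position */ +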
const int w1 = w2; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(scale, w2, inputWidth); + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/Threshold.c b/aten/src/THNN/generic/Threshold.c new file mode 100644 index 0000000..592aa8d --- /dev/null +++ b/aten/src/THNN/generic/Threshold.c @@ -0,0 +1,63 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Threshold.c" +#else + +void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + real val = TH_CONVERT_ACCREAL_TO_REAL(val_); + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= threshold) + *input_data = val; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > threshold) ? *input_data : val; + ); + } +} + +void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if ((*input_data) <= threshold) + *gradOutput_data = 0; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > threshold) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c new file mode 100644 index 0000000..1edf8a9 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c @@ -0,0 +1,304 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAdaptiveAveragePooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +static void THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow 
< osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartT*istrideT + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + + /* compute local average: */ + real sum = 0; + int it, ih, iw; + for(it = 0; it < kT; it++) + { + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); + sum += val; + } + } + } + + /* set output to local average */ + *op = sum / kT / kH / kW; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeT = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideT = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideT = input->stride[dimT]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + } +} + +static void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeT*isizeW*isizeH; + real *gradOutput_p_d = gradOutput_p + d*osizeT*osizeW*osizeH; + + /* calculate average */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int 
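
The START_IND/END_IND macros above decide which input slab feeds each cell of the adaptive pooling: output cell a along an axis of c input elements, split into b output cells, averages input[START_IND .. END_IND). A small worked example (the function names start_ind/end_ind and the sizes are hypothetical, mirroring the macros only) shows that adjacent bins may overlap and have different lengths:

#include <math.h>
#include <stdio.h>

/* Same arithmetic as the START_IND/END_IND macros above. */
static int start_ind(int a, int b, int c) { return (int)floor((float)(a * c) / b); }
static int end_ind(int a, int b, int c)   { return (int)ceil((float)((a + 1) * c) / b); }

int main(void) {
  int isize = 10, osize = 4;                      /* hypothetical sizes */
  for (int o = 0; o < osize; ++o) {
    int s = start_ind(o, osize, isize);
    int e = end_ind(o, osize, isize);
    printf("output %d averages input [%d, %d)  (k = %d)\n", o, s, e, e - s);
  }
  /* prints bins [0,3) [2,5) [5,8) [7,10): overlapping, unequal lengths */
  return 0;
}
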
istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + real grad_delta = gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow] / kT / kH / kW; + + int it, ih, iw; + for(it = istartT; it < iendT; it++) + { + for(ih = istartH; ih < iendH; ih++) + { + for(iw = istartW; iw < iendW; iw++) + { + /* update gradient */ + gradInput_p_d[it*isizeH*isizeW + ih*isizeW + iw] += grad_delta; + } + } + } + } + } + } + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD; + int64_t isizeT; + int64_t isizeH; + int64_t isizeW; + int64_t osizeT; + int64_t osizeH; + int64_t osizeW; + real *gradInput_data; + real *gradOutput_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) { + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeT = gradOutput->size[dimT]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif + +#undef START_IND +#undef END_IND diff --git a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c new file mode 100644 index 0000000..74efa76 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c @@ -0,0 +1,305 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAdaptiveMaxPooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +static void THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int64_t istartT = START_IND(ot, osizeT, isizeT); + int64_t iendT = END_IND(ot, osizeT, isizeT); + int64_t kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int64_t istartH = START_IND(oh, osizeH, isizeH); + int64_t iendH = END_IND(oh, osizeH, isizeH); + int64_t kH = iendH 
- istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int64_t istartW = START_IND(ow, osizeW, isizeW); + int64_t iendW = END_IND(ow, osizeW, isizeW); + int64_t kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + THIndex_t *indp = ind_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -FLT_MAX; + int64_t it, ih, iw; + for(it = 0; it < kT; it++) + { + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + (iw+istartW); + } + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeT = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideT = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + THIndex_t *indices_data = nullptr; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideT = input->stride[dimT]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); + /* indices will contain max input locations for each output point */ + THIndexTensor_(resize4d)(indices, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); + /* indices will contain max input locations for each output point */ + THIndexTensor_(resize5d)(indices, sizeB, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, + indices_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + } +} + +static void 
THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; + real *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; + THIndex_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; + + /* calculate max points */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + for(oh = 0; oh < osizeH; oh++) + { + for(ow = 0; ow < osizeW; ow++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_d[ot*osizeH*osizeW + oh*osizeW + ow] - TH_INDEX_BASE; + + /* update gradient */ + gradInput_p_d[maxp] += gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow]; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD; + int64_t isizeT; + int64_t isizeH; + int64_t isizeW; + int64_t osizeT; + int64_t osizeH; + int64_t osizeW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) { + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeT = gradOutput->size[dimT]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, + indices_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricAveragePooling.c b/aten/src/THNN/generic/VolumetricAveragePooling.c new file mode 100644 index 0000000..c9dd9f7 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAveragePooling.c @@ -0,0 +1,500 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c" +#else + +static inline void THNN_(VolumetricAveragePooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + int ndim = input->dim(); + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + 
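
The adaptive max-pooling forward above records, for every output cell, the argmax as a single flattened offset into one (T, H, W) input plane (plus TH_INDEX_BASE), and the backward pass simply scatters the gradient through that offset. A tiny round-trip sketch of the flattening; the sizes and the max position are made up for illustration:

#include <stdio.h>

int main(void) {
  long isizeT = 3, isizeH = 4, isizeW = 5;                  /* hypothetical  */
  long it = 2, ih = 1, iw = 3;                              /* argmax coords */
  long maxindex = it * isizeH * isizeW + ih * isizeW + iw;  /* flatten       */

  long t = maxindex / (isizeH * isizeW);                    /* unflatten     */
  long h = (maxindex / isizeW) % isizeH;
  long w = maxindex % isizeW;
  printf("flat=%ld -> (t=%ld, h=%ld, w=%ld)\n", maxindex, t, h, w);
  return 0;
}
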
dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + // The second argument is argNumber... here is the index of padH. + THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11, + "pad should not be greater than half of kernel size, but got " + "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d", + padT, padW, padH, kT, kW, kH); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if (ceil_mode) { + otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + else + { + otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((otime - 1)*dT >= itime + padT) + --otime; + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + if (otime < 1 || owidth < 1 || oheight < 1) + THError("Given input size: (%dx%dx%dx%d). " + "Calculated output size: (%dx%dx%dx%d). Output size is too small", + nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool count_include_pad) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int64_t i, j, ti; + + /* local pointers. */ + real *ip = input_p + k * itime * iwidth * iheight; + real *op = output_p + k * otime * owidth * oheight; + for (i = 0; i < otime * oheight * owidth; ++i) + *(op + i) = 0; + + /* loop over output */ + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* compute pool range. 
*/ + int64_t tstart = ti * dT - padT; + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t tend = fminf(tstart + kT, itime + padT); + int64_t hend = fminf(hstart + kH, iheight + padH); + int64_t wend = fminf(wstart + kW, iwidth + padW); + int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = fmaxf(tstart, 0); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + tend = fmin(tend, itime); + hend = fmin(hend, iheight); + wend = fmin(wend, iwidth); + + int divide_factor; + if (count_include_pad) + divide_factor = pool_size; + else + divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart); + + /* compute local sum: */ + real sum = 0.0; + int64_t x, y, z; + + for (z = tstart; z < tend; z++) + { + for (y = hstart; y < hend; y++) + { + for (x = wstart; x < wend; x++) + { + sum += *(ip + z * iwidth * iheight + y * iwidth + x); + } + } + } + + /* set output to local max */ + *op++ += sum / divide_factor; + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_(VolumetricAveragePooling_shapeCheck)( + state, input, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, ceil_mode); + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceil_mode) + { + otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + else + { + otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + if (padT || padH || padW) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((otime - 1)*dT >= itime + padT) + --otime; + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data, output_data, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + 
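
Two details of the average-pooling frame above are easy to miss: the per-axis output length follows the (in - k + 2*pad)/stride + 1 rule (with ceil or floor depending on ceil_mode, plus a trim so the last window still starts inside the unpadded input), and divide_factor is either the full kT*kH*kW window (count_include_pad) or only the part of the window that overlaps the real input. A one-dimensional sketch of the size rule; the helper name pooled_size and the numbers in main() are hypothetical:

#include <math.h>
#include <stdio.h>

/* One axis of the output-size rule used by VolumetricAveragePooling above. */
static long pooled_size(long in, int k, int d, int pad, int ceil_mode) {
  long out = ceil_mode
      ? (long)ceil((float)(in - k + 2 * pad) / d) + 1
      : (long)floor((float)(in - k + 2 * pad) / d) + 1;
  if (pad) {
    /* last pooling window must start inside the (unpadded) input */
    if ((out - 1) * d >= in + pad)
      --out;
  }
  return out;
}

int main(void) {
  printf("%ld\n", pooled_size(7, 3, 2, 1, 1)); /* ceil((7-3+2)/2)+1 = 4       */
  printf("%ld\n", pooled_size(5, 2, 2, 1, 1)); /* 4 at first, trimmed to 3    */
  return 0;
}
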
THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data + p * istride, output_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool count_include_pad) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int64_t i, j, ti; + + /* local pointers */ + real *ip = gradInput_p + k * itime * iwidth * iheight; + real *op = gradOutput_p + k * otime * owidth * oheight; + for (i = 0; i < itime*iwidth*iheight; i++) + *(ip + i) = 0; + + /* loop over output */ + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + int64_t tstart = ti * dT - padT; + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t tend = fminf(tstart + kT, itime + padT); + int64_t hend = fminf(hstart + kH, iheight + padH); + int64_t wend = fminf(wstart + kW, iwidth + padW); + int64_t pool_size = (tend -tstart) * (hend - hstart) * (wend - wstart); + tstart = fmaxf(tstart, 0); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + tend = fminf(tend, itime); + hend = fminf(hend, iheight); + wend = fminf(wend, iwidth); + + int64_t divide_factor; + if (count_include_pad) + divide_factor = pool_size; + else + divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart); + + /* scatter gradients out to footprint: */ + real val = *op++; + + int64_t x,y,z; + for (z = tstart; z < tend; z++) + { + for (y = hstart; y < hend; y++) + { + for (x = wstart; x < wend; x++) + { + *(ip + z * iheight * iwidth + y * iwidth + x) += val / divide_factor; + } + } + } + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *gradInput_data; + real *gradOutput_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THNN_(VolumetricAveragePooling_shapeCheck)( + state, input, gradOutput, kT, kW, kH, + dT, dW, dH, padT, padW, padH, ceil_mode); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 4) /* non-batch mode*/ + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, nslices, + itime, iwidth, iheight, + 
otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c new file mode 100644 index 0000000..d88cc60 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolution.c" +#else + +void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimt++; + dimh++; + dimw++; + } + + int64_t nOutputPlane = weight->size[0]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + int64_t inputDepth = input->size[dimt]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth - kT) / dT + 1; + int64_t outputWidth = (inputWidth - kW) / dW + 1; + int64_t outputHeight = (inputHeight - kH) / dH + 1; + THTensor *outn = THTensor_(new)(); + int64_t i, j; + if (input->dim() == 4) /* non-batch mode */ + { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + /* add bias */ + if (bias) { + for (i = 0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, output, 0, i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + } else { + THTensor_(zero)(output); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X"); + } + else /* batch mode */ + { + int64_t nBatch = input->size[0]; + THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor *inb = THTensor_(new)(); + THTensor *outb = THTensor_(new)(); + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(inb, input, 0, j); + THTensor_(select)(outb, output, 0, j); + + /* add bias */ + if (bias) { + for (i = 0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, outb, 0, i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + } else { + THTensor_(zero)(outb); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X"); + } + + THTensor_(free)(inb); + THTensor_(free)(outb); + } + THTensor_(free)(outn); +} + +void THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, 
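
The non-MM VolumetricConvolution forward above fills each output plane with its bias and then lets conv3Dmv accumulate a strided, valid-mode result on top of it, so the output extent per axis is (in - k)/stride + 1. Below is a one-dimensional analogue of that fill-then-accumulate pattern, not the library routine itself; all sizes and values are made up for illustration:

#include <stdio.h>

int main(void) {
  const int in_len = 7, k_len = 3, d = 2;          /* hypothetical sizes   */
  const double in[7] = {1, 2, 3, 4, 5, 6, 7};
  const double w[3]  = {0.5, 1.0, -0.5};
  const double bias  = 0.1;

  const int out_len = (in_len - k_len) / d + 1;    /* valid output: 3      */
  double out[3];

  for (int o = 0; o < out_len; ++o) {
    out[o] = bias;                                 /* bias first           */
    for (int k = 0; k < k_len; ++k)
      out[o] += w[k] * in[o * d + k];              /* then accumulate      */
    printf("out[%d] = %g\n", o, out[o]);
  }
  return 0;
}
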
+ THTensor *gradInput, + THTensor *weight, + THTensor *finput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + + int nOutputPlane = (int)weight->size[0]; + + THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, + gradOutput, + "non-empty 4D or 5D (batch mode) tensor expected for gradOutput, but got: %s"); + + int dimPlane = 0; + if (gradOutput->dim() == 5) + { + dimPlane++; + } + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + /* gradient to input */ + THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1); + if (gradOutput->dim() == 4) /* non-batch mode */ + { + THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C"); + } + else /* batch mode */ + { + int64_t nBatch = gradOutput->size[0]; + THTensor *ginpb = THTensor_(new)(); + THTensor *goutb = THTensor_(new)(); + int64_t j; + + THTensor_(resize5d)(gradInput, + input->size[0], input->size[1], input->size[2], input->size[3], input->size[4] + ); + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(ginpb, gradInput, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); + THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C"); + } + THTensor_(free)(ginpb); + THTensor_(free)(goutb); + } + + THTensor_(free)(tweight); +} + +void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!gradWeight->is_empty() && gradWeight->dim() == 5, 4, gradWeight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for gradWeight, but got: %s"); + + int nOutputPlane = (int)gradWeight->size[0]; + if (gradBias) { + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size[0] == nOutputPlane, 5, + "gradBias tensor has wrong size" + ); + } + + int64_t k; + real *gradBias_data; + THTensor *gradOutSlice; + int dimPlane = 0; + if (gradOutput->dim() == 5) + { + dimPlane++; + } + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + if (gradOutput->dim() == 4) /* non-batch mode */ + { + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, gradOutput, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW); + } + else /* batch mode */ + { + int64_t nBatch = gradOutput->size[0]; + THTensor *inpb = THTensor_(new)(); + THTensor *goutb = 
THTensor_(new)(); + int64_t j; + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(inpb, input, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); + + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, goutb, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW); + } + THTensor_(free)(inpb); + THTensor_(free)(goutb); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c new file mode 100644 index 0000000..2fa1874 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -0,0 +1,768 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c" +#else + +#define CONV3D_OMP_THRESHOLD 20 + +static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 5), 5, weight, + "non-empty 2D or 5D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + int64_t inputDepth; + int64_t inputHeight; + int64_t inputWidth; + + int64_t exactInputDepth; + int64_t exactInputHeight; + int64_t exactInputWidth; + int64_t outputDepth; + int64_t outputHeight; + int64_t outputWidth; + + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + + exactInputDepth = inputDepth + 2*pT; + exactInputHeight = inputHeight + 2*pH; + exactInputWidth = inputWidth + 2*pW; + + if (exactInputDepth < kT || exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld x %ld). " + "Kernel size: (%ld x %ld x %ld). Kernel size can't be greater than actual input size", + exactInputDepth, exactInputHeight, exactInputWidth, kT, kH, kW); + } + + outputDepth = (exactInputDepth - kT) / dT + 1; + outputHeight = (exactInputHeight - kH) / dH + 1; + outputWidth = (exactInputWidth - kW) / dW + 1; + + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). 
Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kT * kH * kW); + } + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(newViewWeight)(THTensor *weight) +{ + weight = THTensor_(newContiguous)(weight); + if (weight->dim() == 5) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1); + THTensor_(free)(old_weight); + } + return weight; +} + + +// Kernel for fast unfold+copy +// Borrowed from Theano +// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas + +static void THNN_(unfolded_acc_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); +#ifdef _OPENMP + int inOmp = omp_in_parallel(); + #pragma omp parallel if (!inOmp) firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, nInputPlane, inputHeight, inputWidth, inputDepth) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + int64_t n = nInputPlane * inputHeight * inputWidth * inputDepth; + int64_t seg_len_tmp = n / num_threads; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == num_threads - 1)? (n-line_index_offset) : seg_len_tmp; + + int64_t w = line_index_offset % inputWidth + pW; + int64_t h_index = line_index_offset / inputWidth; + int64_t h = h_index % inputHeight + pH; + int64_t d_index = h_index / inputHeight; + int64_t d = d_index % inputDepth + pT; + int64_t c = d_index / inputDepth; +#else + int64_t line_seg_len = nInputPlane * inputHeight * inputWidth * inputDepth; + int64_t line_index_offset = 0; + int64_t w = pW; + int64_t h = pH; + int64_t d = pT; + int64_t c = 0;; +#endif + int64_t outputHW = outputHeight * outputWidth; + int64_t outputDHW = outputDepth * outputHW; + int64_t kHkW = kH*kW; + int64_t kTkHkW = kT*kHkW; + + int64_t coeff_d_col = outputHW - dT * kHkW * outputDHW; + int64_t coeff_h_col = outputWidth - dH * kW * outputDHW; + int64_t coeff_w_col = (1 - dW * outputDHW); + + int64_t count = 0; + while (count < line_seg_len) { + // compute the start and end of the output + int64_t w_col_start = (w < kW) ? 0 : (w - kW) / dW + 1; + int64_t w_col_tmp = w / dW + 1; + int64_t w_col_end = w_col_tmp < outputWidth? w_col_tmp : outputWidth; + + int64_t h_col_start = (h < kH) ? 0 : (h - kH) / dH + 1; + int64_t h_col_tmp = h / dH + 1; + int64_t h_col_end = h_col_tmp < outputHeight? 
h_col_tmp : outputHeight; + + int64_t d_col_start = (d < kT) ? 0 : (d - kT) / dT + 1; + int64_t d_col_tmp = d / dT + 1; + int64_t d_col_end = d_col_tmp < outputDepth? d_col_tmp : outputDepth; + + real val = 0; + int64_t offset = (c * kTkHkW + d * kHkW + h * kW + w) * outputDHW; + + int64_t offset_w_col_start = w_col_start * coeff_w_col; + int64_t offset_d_col_start = d_col_start * coeff_d_col; + int64_t offset_h_col_start = h_col_start * coeff_h_col; + int64_t offset_w_col = offset_w_col_start + offset; + int64_t offset_d_col; + int64_t offset_h_col; + int64_t w_col, d_col, h_col; + for (w_col = w_col_start; w_col < w_col_end; ++w_col) { + offset_d_col = offset_d_col_start + offset_w_col; + for (d_col = d_col_start; d_col < d_col_end; ++d_col) { + offset_h_col = offset_h_col_start + offset_d_col; + for (h_col = h_col_start; h_col < h_col_end; ++h_col) { + val += finput_data[offset_h_col]; + offset_h_col += coeff_h_col; + } + offset_d_col += coeff_d_col; + } + offset_w_col += coeff_w_col; + } + + input_data[line_index_offset+count] = val; + count++; + + if (count < line_seg_len) { + if (w - pW + 1 == inputWidth) { + w = pW; + if (h - pH + 1 == inputHeight) { + h = pH; + if (d - pT + 1 == inputDepth) { + d = pT; + c++; + } + else d++; + } + else h++; + } + else w++; + } + } +#ifdef _OPENMP + } +#endif +} + +/* + Modified from the version of CUDA implementation, but the loop iterations is larger than that one. + The larger loop could lower the proportion of openmp overhead. And the inner part in loop is simpler. + The naive code is below: + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + + int64_t n = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + #pragma omp parallel for firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, inputHeight, inputWidth, inputDepth) + for (int64_t idx = 0; idx < n ; ++idx) { + int64_t w_out = line_index_offset % outputWidth; + int64_t remained = line_index_offset / outputWidth; + int64_t h_out = remained % outputHeight; + remained /= outputHeight; + int64_t d_out = remained % outputDepth; + remained /= outputDepth; + int k = remained % kW; + remained /= kW; + int j = remained % kH; + remained /= kH; + int i = remained % kT; + int64_t nip = remained / kT; + + int64_t d = d_out * dT - pT + i; + int64_t h = h_out * dH - pH + j; + int64_t w = w_out * dW - pW + k; + + finput_data[idx] = (h >= 0 && w >= 0 && d >= 0 && h < inputHeight && w < inputWidth && d < inputDepth) ? + input_data[nip*inputDepth*inputWidth*inputHeight+ d*inputHeight*inputWidth + h*inputWidth + w] : 0; + } + + However, there are 6 quotient and 6 module operations which are very time-consuming. So we choose relatively + more complex but more efficient pattern. 
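
  As an extra illustration (not part of the implementation), the incremental
  pattern used here is the same trick in fewer dimensions: carry the
  coordinates like an odometer instead of recomputing them with '/' and '%'
  for every element. In two dimensions, with hypothetical sizes:

   static void walk_grid(int H, int W) {
     int h = 0, w = 0;
     for (int idx = 0; idx < H * W; ++idx) {
       // (h, w) always equals (idx / W, idx % W) here, but is maintained
       // incrementally, so no division or modulo is needed per element.
       if (++w == W) { w = 0; ++h; }
     }
   }

  The 3-D (and kernel-offset) counters below follow the same carry chain.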
+*/ +static void THNN_(unfolded_copy_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#ifdef _OPENMP + int inOmp = omp_in_parallel(); + #pragma omp parallel if (!inOmp) firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, nInputPlane, inputHeight, inputWidth, inputDepth) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + int64_t n = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + int64_t seg_len_tmp = n / num_threads; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == num_threads - 1)? (n-line_index_offset) : seg_len_tmp; + + int64_t w_out = line_index_offset % outputWidth; + int64_t remained = line_index_offset / outputWidth; + int64_t h_out = remained % outputHeight; + remained /= outputHeight; + int64_t d_out = remained % outputDepth; + remained /= outputDepth; + int k = remained % kW; + remained /= kW; + int j = remained % kH; + remained /= kH; + int i = remained % kT; + int64_t nip = remained / kT; +#else + int64_t line_seg_len = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + int64_t line_index_offset = 0; + int64_t w_out = 0; + int64_t h_out = 0; + int64_t d_out = 0; + int i = 0; + int j = 0; + int k = 0; + int64_t nip = 0; +#endif + + int64_t count = 0; + real* dst = finput_data + line_index_offset; + int64_t inputHW = inputHeight*inputWidth; + int64_t inputDHW = inputHW*inputDepth; + + while (count < line_seg_len) { + int64_t w = w_out * dW - pW + k; + int64_t h = h_out * dH - pH + j; + int64_t d = d_out * dT - pT + i; + + + *dst = (h >= 0 && w >= 0 && d >= 0 && h < inputHeight && w < inputWidth && d < inputDepth) ? 
+ input_data[nip*inputDHW+ d*inputHW + h*inputWidth + w] : 0; + + count++; + if (count < line_seg_len) { + dst++; + w_out++; + if (w_out == outputWidth) { + w_out = 0; + h_out++; + if (h_out == outputHeight) { + h_out = 0; + d_out++; + if (d_out == outputDepth) { + d_out = 0; + k++; + if(k == kW) { + k = 0; + j++; + if(j == kH) { + j = 0; + i++; + if(i == kT) { + i = 0; + nip++; + } + } + } + } + } + } + } + + } +#ifdef _OPENMP + } +#endif +} + +static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t nOutputPlane, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + int64_t i; + THTensor *output2d; + + THNN_(unfolded_copy_vol)( + finput, input, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, + inputDepth, inputWidth, inputHeight, + outputDepth, outputWidth, outputHeight + ); + + output2d = THTensor_(newWithStorage2d)( + output->storage, output->storageOffset, nOutputPlane, -1, + outputDepth*outputHeight*outputWidth, -1 + ); + + if (bias) { + for (i = 0; i < nOutputPlane; i++) + { + THVector_(fill)( + THStorage_(data)(output->storage)+output->storageOffset+output->stride[0]*i, + THTensor_(get1d)(bias, i), + outputDepth*outputHeight*outputWidth + ); + } + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, // unused + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int64_t nInputPlane; + int64_t inputDepth; + int64_t inputHeight; + int64_t inputWidth; + int64_t nOutputPlane; + int64_t outputDepth; + int64_t outputHeight; + int64_t outputWidth; + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, NULL, weight, bias, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 0); + input = THTensor_(newContiguous)(input); + + if (input->dim() == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + nInputPlane = input->size[dimf]; + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + nOutputPlane = weight->size[0]; + outputDepth = (inputDepth + 2*pT - kT) / dT + 1; + outputHeight = (inputHeight + 2*pH - kH) / dH + 1; + outputWidth = (inputWidth + 2*pW - kW) / dW + 1; + + weight = THNN_(newViewWeight)(weight); + + if (input->dim() == 4) + { + THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input, output, weight, bias, finput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth); +#ifdef _OPENMP + #pragma omp parallel for if(T > 
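
To make the unfold + GEMM ("MM") scheme above concrete: unfolded_copy_vol lays the input out as a (nInputPlane*kT*kH*kW) x (outT*outH*outW) matrix, the 5-D weight is viewed as nOutputPlane x (nInputPlane*kT*kH*kW), and a single addmm then yields the nOutputPlane x (outT*outH*outW) output block. A small shape-bookkeeping sketch with hypothetical sizes (the variable names here are only for illustration):

#include <stdio.h>

int main(void) {
  long nInputPlane = 3, nOutputPlane = 8;
  long kT = 3, kH = 3, kW = 3;
  long outT = 5, outH = 10, outW = 10;

  long k = nInputPlane * kT * kH * kW;   /* shared inner dimension          */
  long n = outT * outH * outW;           /* one column per output location  */

  printf("finput  : %ld x %ld\n", k, n);            /* unfolded input       */
  printf("weight  : %ld x %ld\n", nOutputPlane, k); /* 2-D view of weights  */
  printf("output2d: %ld x %ld\n", nOutputPlane, n); /* weight * finput      */
  return 0;
}
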
CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input_t, output_t, weight, bias, finput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_vol)( + fgradInput, gradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], + gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + ); +} + +void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, weight, NULL, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 0); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + weight = THNN_(newViewWeight)(weight); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if (input->dim() == 4) + { + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput, gradOutput, tweight, fgradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#ifdef _OPENMP + #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput_t, gradOutput_t, tweight, fgradInput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // 
can be NULL if gradWeight = NULL + real scale) +{ + int64_t i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + if (gradWeight){ + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + } + + if (gradBias) { + for (i = 0; i < gradBias->size[0]; i++) + { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for (k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + if (gradWeight) { + gradWeight = THNN_(newViewWeight)(gradWeight); + } + + if (input->dim() == 4) // non-batch mode + { + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + } + else // batch mode + { + int64_t T = input->size[0]; + int64_t t; + +#ifdef _OPENMP + #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = NULL; + if (gradWeight) { + finput_t = THTensor_(newSelect)(finput, 0, t); + } + + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + if (gradWeight) { + THTensor_(free)(finput_t); + } + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (gradWeight) { + THTensor_(free)(gradWeight); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c new file mode 100644 index 0000000..66d560a --- /dev/null +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -0,0 +1,455 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c" +#else + +static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kT, int kH, int kW, int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && 
dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + // Params + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + int64_t inputDepth = input->size[dimd]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, NULL, weight, bias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params: + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputHeight = input->size[3]; + int64_t inputWidth = input->size[4]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + 
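For reference, the shape check and the size computation above both derive the dilated convolution's output extent per spatial dimension as (in + 2*pad - (dilation*(k-1)+1)) / stride + 1. Below is a minimal standalone sketch of that arithmetic; the helper name `conv_out_size` and the sample sizes are illustrative, not part of THNN.

```c
#include <stdint.h>
#include <stdio.h>

/* Output extent of a dilated convolution along one dimension:
 * effective kernel = dilation * (k - 1) + 1, then a plain strided convolution. */
static int64_t conv_out_size(int64_t in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main(void) {
  /* e.g. a 16x32x32 volume, 3x3x3 kernel, stride 1, padding 1, dilation 2 */
  int64_t d = conv_out_size(16, 3, 1, 1, 2);
  int64_t h = conv_out_size(32, 3, 1, 1, 2);
  int64_t w = conv_out_size(32, 3, 1, 1, 2);
  printf("output: %lld x %lld x %lld\n", (long long)d, (long long)h, (long long)w);
  /* prints 14 x 30 x 30; the shape check errors out if any extent drops below 1 */
  return 0;
}
```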
THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || + ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, weight, NULL, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], 
gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + int64_t m = nInputPlane*kT*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 1); + + // Params + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kT*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c new file mode 100644 index 0000000..1641c60 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c @@ -0,0 +1,503 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c" +#else + +static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) { + int ndim = input->dim(); + int dimN = 
0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d", + dilationT, dilationH, dilationW); + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2, + "pad should be smaller than half of kernel size, but got " + "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", + kT, kW, kH, pT, pW, pH); + + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + if (otime < 1 || owidth < 1 || oheight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). 
Output size is too small", + nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *indz_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + int64_t i, j, ti; + real *ip = input_p + k * itime * iwidth * iheight; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* local pointers */ + + int64_t start_t = ti * dT - pT; + int64_t start_h = i * dH - pH; + int64_t start_w = j * dW - pW; + + int64_t end_t = fminf(start_t + (kT - 1) * dilationT + 1, itime); + int64_t end_h = fminf(start_h + (kH - 1) * dilationH + 1, iheight); + int64_t end_w = fminf(start_w + (kW - 1) * dilationW + 1, iwidth); + + while(start_t < 0) + start_t += dilationT; + while(start_h < 0) + start_h += dilationH; + while(start_w < 0) + start_w += dilationW; + + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + THIndex_t *indzp = indz_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x,y,z; + int64_t index = 0; + + for (z = start_t; z < end_t; z += dilationT) + { + for (y = start_h; y < end_h; y += dilationH) + { + for (x = start_w; x < end_w; x += dilationW) + { + index = z * iwidth * iheight + y * iwidth + x; + real val = ip[index]; + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = index; + } + } + } + } + + // store location of max + *indzp = maxindex + TH_INDEX_BASE; + + /* set output to local max */ + *op = maxval; + } + } + } + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, NULL, NULL, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, dilationH, + ceilMode); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - 
(dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j uchar locations packed into float/double */ + THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j locations for each output point */ + THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data + p * istride, + output_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *indz_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; + real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; + THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight; + + /* calculate max points */ + int64_t ti, i, j; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* retrieve position of max */ + int64_t index = ti * oheight * owidth + i * owidth + j; + int64_t maxp = indz_p_k[index] - TH_INDEX_BASE; + + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[index]; + } + } + } + } + } +} + +void 
THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + int nslices; + int itime; + int iheight; + int iwidth; + int otime; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, gradOutput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, dilationH, + ceilMode); + + // TODO: gradOutput shape check + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) /* non-batch mode*/ + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data + p * istride, + gradOutput_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c b/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c new file mode 100644 index 0000000..12f9925 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c @@ -0,0 +1,279 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c" +#else + +static int64_t* THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + real sample, + int64_t inputSize, + int64_t outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + int64_t* sequence = (int64_t*) THAlloc(sizeof(int64_t) * outputSize); + + int64_t i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (int64_t) ((i + sample) * alpha) - (int64_t) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + int64_t numPlanes, + int64_t inputT, int64_t inputW, int64_t inputH, + int64_t outputT, int64_t outputW, int64_t outputH, + int poolSizeT, int poolSizeW, int 
poolSizeH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 3 random samples, one for T, one for W, and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 3; + + /* Generate interval sequence */ + int64_t* sequenceT = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputT, outputT, poolSizeT); + int64_t* sequenceW = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputW, outputW, poolSizeW); + int64_t* sequenceH = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[2], inputH, outputH, poolSizeH); + + /* loop over output */ + int64_t h, w, t; + + real* inputForPlane = input + plane * inputT * inputW * inputH; + real* outputForPlane = output + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + int64_t inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + int64_t inputWStart = sequenceW[w]; + + for (t = 0; t < outputT; ++t) { + int64_t inputTStart = sequenceT[t]; + + real maxVal = -THInf; + int64_t maxIndex = -1; + + int64_t h2, w2, t2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) { + THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + THAssert(t2 >= 0 && t2 < inputT); + + int64_t planeIndex = h2 * inputW * inputT + w2 * inputT + t2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE; + } + } + } + + THFree(sequenceT); + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + int64_t numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + int64_t inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 9, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 < inputW, 8, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + THArgCheck(outputT + poolSizeT - 1 < inputT, 7, + "poolSizeT (%d) too large relative to input time (%d)", + poolSizeT, inputT); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 4) { + /* resize output */ + 
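The interval generator earlier in this file picks the pooling-window starts from a single uniform sample u in [0, 1): start[i] = floor((i + u) * alpha) - floor(u * alpha) with alpha = (inputSize - poolSize) / (outputSize - 1), and the last window is pinned to inputSize - poolSize. A plain-C sketch of that formula, with a made-up helper name:

```c
#include <stdio.h>
#include <stdlib.h>

/* Pseudo-random window starts for fractional max pooling along one dimension.
 * u is a single uniform sample in [0, 1); the caller frees the result. */
static long *fractional_starts(double u, long input_size, long output_size, int pool_size) {
  double alpha = (double)(input_size - pool_size) / (double)(output_size - 1);
  long *seq = malloc(sizeof(long) * output_size);
  for (long i = 0; i < output_size - 1; ++i)
    seq[i] = (long)((i + u) * alpha) - (long)(u * alpha);
  seq[output_size - 1] = input_size - pool_size;  /* last window ends exactly at the border */
  return seq;
}

int main(void) {
  long *starts = fractional_starts(0.3, 10, 4, 2); /* 10 -> 4 with 2-wide windows */
  for (int i = 0; i < 4; ++i) printf("%ld ", starts[i]);
  printf("\n"); /* prints 0 3 6 8: starts are non-decreasing and every window stays in-bounds */
  free(starts);
  return 0;
}
```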
THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT); + + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } else { + THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT); + + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + THTensor_(data)(randomSamples) + batch * numPlanes * 3, + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + int64_t numPlanes, + int64_t inputT, int64_t inputW, int64_t inputH, + int64_t outputT, int64_t outputW, int64_t outputH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + int64_t h, w, t; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + for (t = 0; t < outputT; ++t) { + int64_t outputIndex = h * outputW * outputT + w * outputT + t; + int64_t index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputT * inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + int64_t numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + int64_t inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3, + "gradOutput time unexpected"); + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + 
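Both pooling backward kernels in this diff follow the same pattern: the forward pass stores, for every output element, the flat index of the input element that won the max, and the backward pass accumulates gradOutput into gradInput at those indices. A minimal sketch of that scatter-add, using hypothetical names and no TH tensor machinery:

```c
#include <stdio.h>

/* Scatter-add pooled gradients back to the input using the argmax indices
 * recorded during the forward pass (one flat index per output element). */
static void maxpool_backward_frame(float *grad_input, long input_numel,
                                   const float *grad_output, const long *indices,
                                   long output_numel) {
  for (long i = 0; i < input_numel; ++i) grad_input[i] = 0.f;
  for (long o = 0; o < output_numel; ++o) {
    long idx = indices[o];
    if (idx >= 0 && idx < input_numel)   /* -1 would mean "no valid element found" */
      grad_input[idx] += grad_output[o];
  }
}

int main(void) {
  float gin[6];
  float gout[3] = {1.f, 2.f, 3.f};
  long idx[3] = {0, 4, 4};            /* two outputs picked the same input element */
  maxpool_backward_frame(gin, 6, gout, idx, 3);
  for (int i = 0; i < 6; ++i) printf("%.1f ", gin[i]);  /* 1.0 0.0 0.0 0.0 5.0 0.0 */
  printf("\n");
  return 0;
}
```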
THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 4) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } else { + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFullConvolution.c b/aten/src/THNN/generic/VolumetricFullConvolution.c new file mode 100644 index 0000000..e546584 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFullConvolution.c @@ -0,0 +1,60 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" +#else + +void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THNN_(VolumetricFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH); +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH); +} + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + accreal scale_) +{ + THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH, scale_); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c new file mode 100644 index 0000000..c7c18ea --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -0,0 +1,573 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullDilatedConvolution.c" +#else + +static void THNN_(vol2col)( + const real 
*data_vol, const int64_t channels, + const int64_t depth, const int64_t height, const int64_t width, + const int64_t depth_col, const int64_t height_col, const int64_t width_col, + const int64_t kT, const int64_t kH, const int64_t kW, + const int64_t pT, const int64_t pH, const int64_t pW, + const int64_t dT, const int64_t dH, const int64_t dW, + const int64_t dilationT, const int64_t dilationH, const int64_t dilationW, + real *data_col) +{ + int64_t c, t, h, w; + int64_t channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int64_t w_offset = c % kW; + int64_t h_offset = (c / kW) % kH; + int64_t t_offset = (c / kW / kH) % kT; + int64_t c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) + { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) + { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0; + } + } + } + } +} + +static void THNN_(col2vol)( + const real* data_col, const int64_t channels, + const int64_t depth, const int64_t height, const int64_t width, + const int64_t out_depth, const int64_t out_height, const int64_t out_width, + const int64_t kT, const int64_t kH, const int64_t kW, + const int64_t pT, const int64_t pH, const int64_t pW, + const int64_t dT, const int64_t dH, const int64_t dW, + const int64_t dilationT, const int64_t dilationH, const int64_t dilationW, + real* data_vol) +{ + int64_t c, t, h, w; + memset(data_vol, 0, sizeof(real) * depth * height * width * channels); + int64_t depth_col = out_depth; + int64_t height_col = out_height; + int64_t width_col = out_width; + int64_t channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int64_t w_offset = c % kW; + int64_t h_offset = (c / kW) % kH; + int64_t t_offset = (c / kW / kH) % kT; + int64_t c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) + { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) + { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col[((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kT, int kW, int kH, int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", 
+ dilationT, dilationH, dilationW); + THArgCheck((aT < dT || aT < dilationT) + && (aW < dW || aW < dilationW) + && (aH < dH || aH < dilationH), 15, + "output padding must be smaller than either stride or dilation," + " but got aT: %d aH: %d aW: %d dT: %d dH: %d dW: %d " + "dilationT: %d dilationH: %d dilationW: %d", + aT, aH, aW, dT, dH, dW, dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + if (weight != NULL) { + const int64_t nInputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + const int64_t inputWidth = input->size[dimw]; + const int64_t inputHeight = input->size[dimh]; + const int64_t inputDepth = input->size[dimd]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + const int64_t nOutputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + const int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *columns = finput; + THTensor *ones = fgradInput; + + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, NULL, weight, bias, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); + + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? 
THTensor_(newContiguous)(bias) : bias; + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const int64_t n = columns->size[1]; + const int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m_ = nOutputPlane; + const int64_t n_ = outputDepth * outputHeight * outputWidth; + const int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) + { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor 
*gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *gradColumns = finput; + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, gradOutput, weight, NULL, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); + + const int64_t nInputPlane = weight->size[0]; + const int64_t nOutputPlane = weight->size[1]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradColumns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m = weight->size[0]; + const int64_t n = gradColumns->size[1]; + const int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); 
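The full (transposed) convolution above inverts the usual size relation: each output extent is (in - 1)*stride - 2*pad + (dilation*(k-1)+1) + adjustment. A small sketch of that formula with an illustrative helper name; the example sizes deliberately invert the dilated-convolution example shown earlier.

```c
#include <stdint.h>
#include <stdio.h>

/* Output extent of a (dilated) transposed convolution along one dimension. */
static int64_t conv_transpose_out_size(int64_t in, int k, int stride, int pad,
                                       int dilation, int adj) {
  return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
}

int main(void) {
  /* 14x30x30 feature map, 3x3x3 kernel, stride 1, padding 1, dilation 2, no adjustment:
   * this recovers the 16x32x32 input of the forward dilated-convolution sketch. */
  printf("%lld x %lld x %lld\n",
         (long long)conv_transpose_out_size(14, 3, 1, 1, 2, 0),
         (long long)conv_transpose_out_size(30, 3, 1, 1, 2, 0),
         (long long)conv_transpose_out_size(30, 3, 1, 1, 2, 0));
  return 0;
}
```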
+} + +void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, // extra output adjustment + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 1); + + int64_t nOutputPlane; + if (gradWeight) { + nOutputPlane = THTensor_(size)(gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THTensor_(size)(gradBias, 0); + } else { + return; + } + + THTensor *columns = finput; + THTensor *ones = fgradInput; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
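The ones buffer exists only so the bias gradient can be expressed as a GEMV: multiplying gradOutput (viewed as channels x spatial) by a vector of ones is simply a per-channel sum over spatial positions, scaled and accumulated into gradBias. The loop below is the equivalent plain-C reduction, with illustrative names:

```c
#include <stdio.h>

/* Bias-gradient accumulation: per-channel sum of gradOutput over all spatial
 * positions, which the kernel above expresses as a GEMV against a ones vector. */
static void acc_grad_bias(float *grad_bias, const float *grad_output,
                          long channels, long spatial, float scale) {
  for (long c = 0; c < channels; ++c) {
    float sum = 0.f;
    for (long s = 0; s < spatial; ++s)
      sum += grad_output[c * spatial + s];   /* gradOutput laid out as C x (D*H*W) */
    grad_bias[c] += scale * sum;
  }
}

int main(void) {
  float gout[2 * 3] = {1, 2, 3, 4, 5, 6};  /* 2 channels, 3 spatial positions */
  float gbias[2] = {0, 0};
  acc_grad_bias(gbias, gout, 2, 3, 1.0f);
  printf("%.1f %.1f\n", gbias[0], gbias[1]);   /* prints 6.0 15.0 */
  return 0;
}
```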
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), nOutputPlane, + outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw + const int64_t m = input_n->size[0]; // nInputPlane + const int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m_ = nOutputPlane; + const int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, input->size[1], inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c new file mode 100644 index 0000000..4d7ace4 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c @@ -0,0 +1,409 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int odepth = THTensor_(size)(grid, 1); + int oheight = THTensor_(size)(grid, 2); + int owidth = THTensor_(size)(grid, 3); + + THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); + } +} + +#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ + x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ + ? THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + // resize output to the same shape as input + THTensor_(resize5d)(output, N, C, D, H, W); + + // loop over each output pixel + int n, d, h, w, c; +#pragma omp parallel for private(n, d, h, w, c) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * 
(iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); + real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + + bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; + THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); + } + } + } + } + } +} + +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ + real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ + THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); + THTensor_(resize5d)(gradGrid, N, D, H, W, 3); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, d, h, w; +//#pragma omp parallel for private(n, d, h, w) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the 
corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + real gix = 0; + real giy = 0; + real giz = 0; + + // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + 
iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); + + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); + real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); + real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); + + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, 
giy_old + giy); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); + } + } + } + } +} + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/VolumetricMaxPooling.c b/aten/src/THNN/generic/VolumetricMaxPooling.c new file mode 100644 index 0000000..a3601e0 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricMaxPooling.c @@ -0,0 +1,50 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricMaxUnpooling.c b/aten/src/THNN/generic/VolumetricMaxUnpooling.c new file mode 100644 index 0000000..b8e649c --- /dev/null +++ b/aten/src/THNN/generic/VolumetricMaxUnpooling.c @@ -0,0 +1,339 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c" +#else + +static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THNN_CHECK_SHAPE_INDICES(input, indices); + + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + + int dimw = 3; + int dimh = 2; + int dimt = 1; + int dimn = 0; + + if (input->dim() == 5) + { + dimt++; + dimw++; + dimh++; + dimn++; + } + int nslices = input->size[dimn]; + + if (gradOutput != NULL) { + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", + oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw] + ); + } + + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), dimn, nslices); + } +} + +static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH) +{ + int k; + int has_error = 0; + THIndex_t error_index = 0; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k * oT * oH * oW; + real *input_p_k = input_p + k * iT * iH * iW; + THIndex_t *ind_p_k = ind_p + k * iT * iH * iW; + + int t, i, j, index; + THIndex_t maxp; + for (t = 0; t < iT; t++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + index = t * iH * iW + i * iW + j; + maxp = ind_p_k[index] - TH_INDEX_BASE; /* retrieve position of max */ + if (maxp < 0 || maxp >= oT * oW * oH) + { +#pragma omp critical + { + has_error = 1; + error_index = maxp; + } + } else { + output_p_k[maxp] = input_p_k[index]; /* update output */ + } + } + } + } + } + if (has_error) { + THError( + "found an invalid max index %ld (output volumes are of size %dx%dx%d)", + error_index, oT, oH, oW + ); + } +} + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; + int iH; + int iW; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, NULL, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + else + { + int p; + + THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iT*iW*iH, + output_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k * iT * iH * iW; + real *gradOutput_p_k = gradOutput_p + k * oT * oH * oW; + THIndex_t *ind_p_k = ind_p + k 
* iT * iH * iW; + + int t, i, j, index; + THIndex_t maxp; + for (t = 0; t < iT; t++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + index = t * iH * iW + i * iW + j; + maxp = ind_p_k[index] - TH_INDEX_BASE; /* retrieve position of max */ + if (maxp < 0 || maxp >= oT * oH * oW) + { + THError("invalid max index %ld, oT= %d, oW= %d, oH= %d", maxp, oT, oW, oH); + } + gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } + } +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; + int iH; + int iW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, gradOutput, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + // TODO: check gradOutput shape + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data+p*nslices*iT*iW*iH, + gradOutput_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricReplicationPadding.c b/aten/src/THNN/generic/VolumetricReplicationPadding.c new file mode 100644 index 0000000..e64cb36 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricReplicationPadding.c @@ -0,0 +1,357 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c" +#else + +static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + THArgCheck(owidth >= 1 
|| oheight >= 1 || odepth >= 1, 2, + "input (D: %d H: %d, W: %d)is too small." + " Calculated output D: %d H: %d W: %d", + idepth, iheight, iwidth, odepth, oheight, owidth); + + if (gradOutput != NULL) { + THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + nslices, THTensor_(size)(gradOutput, dimslices)); + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3, + "gradOutput depth unexpected. Expected: %d, Got: %d", + odepth, THTensor_(size)(gradOutput, dimd)); + } +} + +static void THNN_(VolumetricReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, int64_t idepth, + int64_t owidth, int64_t oheight, int64_t odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + int64_t k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + int64_t i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *dest_p = output_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *src_p = input_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, nslices, odepth, oheight, owidth); + + input_data = 
THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data, output_data, nslices, iwidth, iheight, idepth, + owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront, + pback); + } + else + { + int64_t p; + + THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data + p * nslices * iwidth * iheight * idepth, + output_data + p * nslices * owidth * oheight * odepth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, int64_t idepth, + int64_t owidth, int64_t oheight, int64_t odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + int64_t k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + int64_t i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *src_p = goutput_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *dest_p = ginput_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 4) { + 
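+    /* non-batch case: the 4D input is a single sample, so one frame call below
+       accumulates all of gradOutput back into gradInput over every feature plane */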
THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c new file mode 100644 index 0000000..b0e1a2f --- /dev/null +++ b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c @@ -0,0 +1,173 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 5, 2, input, + "5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth); + } +} + + +void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputDepth, + int outputHeight, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputDepth = THTensor_(size)(input, 2); + int inputHeight = THTensor_(size)(input, 3); + int inputWidth = THTensor_(size)(input, 4); + const float depth_scale = (float) inputDepth / (float) outputDepth; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + + THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + + THTensor_(resize5d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputDepth, + outputHeight, + outputWidth); + channels = channels * nbatch; + + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = d2; + for (int h2 = 0; h2 < outputHeight; 
++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(input); + return; + } + + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, inputDepth); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + const real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THNN_(VolumetricUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *idata = THTensor_(data)(gradInput); + real *odata = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float depth_scale = (float) inputDepth / (float) outputDepth; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = d2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, inputDepth); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputDepth * inputHeight * 
inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c new file mode 100644 index 0000000..e24a3e9 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c @@ -0,0 +1,219 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth); + } +} + +void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputDepth = THTensor_(size)(input, 2); + int inputHeight = THTensor_(size)(input, 3); + int inputWidth = THTensor_(size)(input, 4); + + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize5d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && + outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int t2 = 0; t2 < outputDepth; ++t2) { + const int t1 = t2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(input); + return; + } + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = 
linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int t2 = 0; t2 < outputDepth; ++t2) { + const accreal t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < inputDepth - 1) ? 1 : 0; + const real t1lambda = t1r - t1; + const real t0lambda = (real)1. - t1lambda; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * inputWidth] + + w1lambda * pos1[h1p * inputWidth + w1p])) + + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth] + + w1lambda * pos1[t1p * inputHeight * inputWidth + + w1p]) + + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth + + h1p * inputWidth] + + w1lambda * pos1[t1p * inputHeight * inputWidth + + h1p * inputWidth + w1p])); + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners){ + + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int t2 = 0; t2 < outputDepth; ++t2) { + const int t1 = t2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const 
accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int t2 = 0; t2 < outputDepth; ++t2) { + const accreal t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < inputDepth - 1) ? 1 : 0; + const real t1lambda = t1r - t1; + const real t0lambda = (real)1. - t1lambda; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0]; + pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0]; + pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0]; + pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/linear_upsampling.h b/aten/src/THNN/generic/linear_upsampling.h new file mode 100644 index 0000000..2873506 --- /dev/null +++ b/aten/src/THNN/generic/linear_upsampling.h @@ -0,0 +1,51 @@ +#ifndef THNN_LINEAR_UPSAMPLING_H +#define THNN_LINEAR_UPSAMPLING_H + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + + +template +static inline T linear_upsampling_compute_scale( + int inputSize, int outputSize, bool align_corners) { + /* We view each pixel as an area, idx + 0.5 as its center index. + * Here is an example formula in 1D case. + * if align_corners: center of two corner pixel areas are preserved, + * (0.5, 0.5) -> (0.5, 0.5), + * (inputSize - 0.5, 0.5) -> (outputSize - 0.5) + * scale = (inputSize - 0.5 - 0.5) / (outputSize - 0.5 - 0.5) + * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) + * if not align_corners: the whole range is scaled accordingly + * scale = inputSize / outputSize + * src_idx + 0.5 = scale * (dst_index + 0.5) + */ + if (outputSize > 1) { + return align_corners ? (T) (inputSize - 1) / (outputSize - 1) + : (T) inputSize / outputSize; + } else { + return T(0); + } +} + +template +static inline T linear_upsampling_compute_source_index( + T scale, int dst_index, bool align_corners) { + if (align_corners) { + return scale * dst_index; + } else { + T src_idx = scale * (dst_index + 0.5) - 0.5; + return src_idx < 0 ? 
T(0) : src_idx; + } +} + +static inline int nearest_neighbor_compute_source_index( + const float scale, int dst_index, int inputSize) { + const int src_index = MIN(floorf(dst_index * scale), inputSize - 1); + return src_index; +} + + +#endif + diff --git a/aten/src/THNN/generic/unfold.c b/aten/src/THNN/generic/unfold.c new file mode 100644 index 0000000..7feae7c --- /dev/null +++ b/aten/src/THNN/generic/unfold.c @@ -0,0 +1,166 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/unfold.c" +#else + +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ +void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) +{ + // This function assumes that + // outputHeight*dH does not overflow a int64_t + // outputWidth*dW does not overflow a int64_t + + int nip; + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + int kw, kh, y, x; + int64_t ix, iy; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth); + real *dst = input_data + nip*((size_t)inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + int lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH - padH + kh; + if (iy < 0 || iy >= inputHeight) { + } else { + if (dW==1){ + ix = 0 - padW + kw; + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ + } + else{ + for (x=0; x= inputWidth){ + }else{ + real *dst_slice = dst+(size_t)iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); + } + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH + kh; + ix = 0 + kw; + if (dW == 1 ) { + real *dst_slice = dst+(size_t)iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ + }else{ + for(x = 0; x < outputWidth; x++) { + real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); + } + } + } + } + } + } + } +} + +void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) +{ + // This function assumes that + // kH*kW does not overflow an int + // nInputPlane*kH*kW does not overflow a int64_t + // outputHeight*dH does not overflow a int64_t + // outputWidth*dW does not overflow a int64_t + + int64_t k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(k) + for(k = 0; k < (int64_t)nInputPlane*kH*kW; k++) { + int64_t nip = k / (kH*kW); + int64_t rest = k % (kH*kW); + int64_t kh = rest / kW; + int64_t kw = rest % kW; + int x, y; + int64_t ix, iy; + real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) 
+ kw*((size_t)outputHeight*outputWidth); + real *src = input_data + nip*((size_t)inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + int64_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH - padH + kh; + if (iy < 0 || iy >= inputHeight) { + memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (dW==1){ + ix = 0 - padW + kw; + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + if (outputWidth-rpad-lpad <= 0) { + memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad); + memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad)); + if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); + } + } + else{ + for (x=0; x= inputWidth) + memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1); + else + memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1)); + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH + kh; + ix = 0 + kw; + if (dW == 1) + memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth); + else{ + for (x=0; x + +#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) +#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME) + +#define THNN_CHECK_SHAPE(I1, I2) \ + if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THTensor_(sizeDesc)(I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } + +#define THNN_CHECK_SHAPE_INDICES(I1, I2) \ + THLongStorage *size2 = THLongTensor_newSizeOf(I2); \ + if (I1 != NULL && I2 != NULL && !THTensor_(isSize)(I1, size2)) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THLongTensor_sizeDesc(I2); \ + THLongStorage_free(size2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } else { \ + THLongStorage_free(size2); \ + } + +#define THNN_CHECK_NELEMENT(I1, I2) \ + if (I1 != NULL && I2 != NULL ) { \ + ptrdiff_t n1 = THTensor_(nElement)(I1); \ + ptrdiff_t n2 = THTensor_(nElement)(I2); \ + if (n1 != n2) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THTensor_(sizeDesc)(I2); \ + THError(#I1 " and " #I2 " have different number of elements: " \ + #I1 "%s has %ld elements, while " \ + #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ + } \ + } + +#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ + if (THTensor_(nDimension)(T) != DIM || \ + THTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THDescBuff s1 = THTensor_(sizeDesc)(T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ + if (THIndexTensor_(nDimension)(T) != DIM || \ + THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \ + if (!(COND)) { \ + THDescBuff s1 = THTensor_(sizeDesc)(T); \ + THArgCheck(COND, ARG, FORMAT, s1.str); \ + } + +#include "generic/Abs.c" +#include "THGenerateFloatTypes.h" + +#include "generic/AbsCriterion.c" +#include "THGenerateFloatTypes.h" + 
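+// Note: each "generic/*.c" file in this list is re-included once per scalar type
+// pulled in by THGenerateFloatTypes.h, so every THNN_(Name) function is stamped
+// out for float and double; THNN_(NAME) concatenates the Real type into the
+// symbol (e.g. THNN_FloatAbs_updateOutput / THNN_DoubleAbs_updateOutput), the
+// same TH_CONCAT_3 pattern used by the torch_/nn_ macros above.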
+#include "generic/BCECriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/ClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Col2Im.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/DistKLDivCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/ELU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/HardTanh.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Im2Col.c" +#include "THGenerateFloatTypes.h" + +#include "generic/GatedLinearUnit.c" +#include "THGenerateFloatTypes.h" + +#include "generic/L1Cost.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LeakyReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/FusedRNNKernel.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LogSigmoid.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LookupTable.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MSECriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiLabelMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Linear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/PReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/RReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sigmoid.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SmoothL1Criterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftPlus.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftShrink.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SparseLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/IndexLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sqrt.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Square.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Tanh.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Threshold.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalRowConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalUpSamplingLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/FeatureLPPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/BatchNormalization.c" +#include "THGenerateFloatTypes.h" + +#include "generic/unfold.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionMap.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionMM.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionLocal.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullConvolutionMap.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialDilatedConvolution.c" +#include 
"THGenerateFloatTypes.h" + +#include "generic/SpatialAdaptiveMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAdaptiveAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFractionalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialDilatedMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialMaxUnpooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialUpSamplingBilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolutionMM.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFullDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFullConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAdaptiveMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAdaptiveAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricDilatedMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFractionalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxUnpooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialReflectionPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalReflectionPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricUpSamplingTrilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh new file mode 100755 index 0000000..c341b88 --- /dev/null +++ b/aten/tools/run_tests.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -x +set -e + +VALGRIND_SUP="${PWD}/`dirname $0`/valgrind.sup" +pushd $1 + +VALGRIND=${VALGRIND:=ON} +./basic +./atest +./scalar_test +./broadcast_test +./wrapdim_test +./apply_utils_test +./dlconvertor_test +./native_test +./scalar_tensor_test +./undefined_tensor_test +if [[ -x ./cudnn_test ]]; then + ./cudnn_test +fi +if [[ -x ./cuda_rng_test ]]; then + ./cuda_rng_test +fi +if [[ -x ./apply_test ]]; then + ./apply_test +fi +if [[ -x ./stream_test ]]; then + ./stream_test +fi +if [ "$VALGRIND" == "ON" ] +then + valgrind --suppressions="$VALGRIND_SUP" --error-exitcode=1 ./basic "[cpu]" +fi + +popd diff --git a/aten/tools/test_install.sh b/aten/tools/test_install.sh new file mode 100755 index 
0000000..381d1e7 --- /dev/null +++ b/aten/tools/test_install.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -x +set -e +INSTALL_ROOT=$1 +SRC_ROOT=$2 +rm -rf test_build +mkdir test_build +cd test_build +cmake -DCMAKE_PREFIX_PATH=$INSTALL_ROOT $SRC_ROOT/src/ATen/test/test_install +make +./main diff --git a/aten/tools/update_doc.sh b/aten/tools/update_doc.sh new file mode 100755 index 0000000..f8fb6c3 --- /dev/null +++ b/aten/tools/update_doc.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cp build/src/ATen/ATen/{Tensor,Type,Functions}.h doc + diff --git a/aten/tools/valgrind.sup b/aten/tools/valgrind.sup new file mode 100644 index 0000000..fd1c39d --- /dev/null +++ b/aten/tools/valgrind.sup @@ -0,0 +1,11 @@ +{ + + Memcheck:Cond + fun:index + fun:expand_dynamic_string_token + fun:_dl_map_object + fun:map_doit + fun:_dl_catch_error + fun:handle_ld_preload + ... +} diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt new file mode 100644 index 0000000..2f37470 --- /dev/null +++ b/binaries/CMakeLists.txt @@ -0,0 +1,56 @@ +caffe2_binary_target("convert_caffe_image_db.cc") +caffe2_binary_target("convert_db.cc") +caffe2_binary_target("make_cifar_db.cc") +caffe2_binary_target("make_mnist_db.cc") +caffe2_binary_target("predictor_verifier.cc") +caffe2_binary_target("print_registered_core_operators.cc") +caffe2_binary_target("run_plan.cc") +caffe2_binary_target("speed_benchmark.cc") +caffe2_binary_target("split_db.cc") + +caffe2_binary_target("db_throughput.cc") + + +if (USE_CUDA) + caffe2_binary_target("inspect_gpus.cc") + target_link_libraries(inspect_gpus ${CUDA_LIBRARIES}) + caffe2_binary_target("print_core_object_sizes.cc") + + if (BUILD_TEST) + # Core overhead benchmark + caffe2_binary_target("core_overhead_benchmark.cc") + target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY}) + endif() +endif() + +if (USE_ZMQ) + caffe2_binary_target("zmq_feeder.cc") + target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES}) +endif() + +if(USE_MPI) + caffe2_binary_target("run_plan_mpi.cc") + target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES}) +endif() + +if (USE_OPENCV AND USE_LEVELDB) + caffe2_binary_target("convert_encoded_to_raw_leveldb.cc") + target_link_libraries( + convert_encoded_to_raw_leveldb + ${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES}) +endif() + +if (USE_OPENCV) + caffe2_binary_target("make_image_db.cc") + target_link_libraries(make_image_db ${OpenCV_LIBS}) +endif() + +if (USE_OBSERVERS) + add_executable(caffe2_benchmark "caffe2_benchmark.cc" "benchmark_helper.cc") + target_link_libraries(caffe2_benchmark ${Caffe2_MAIN_LIBS}) + target_link_libraries(caffe2_benchmark ${Caffe2_MODULES}) + install(TARGETS caffe2_benchmark DESTINATION bin) +endif() + +# ---[ tutorials +caffe2_binary_target("tutorial_blob.cc") diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py new file mode 100644 index 0000000..048e151 --- /dev/null +++ b/binaries/bench_gen/bench_gen.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse + +from caffe2.python.model_helper import ModelHelper +from caffe2.python.predictor import mobile_exporter +from caffe2.python import workspace, brew + + +def parse_kwarg(kwarg_str): + key, value = kwarg_str.split('=') + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + pass + return key, value + + +def main(args): + # User defined keyword 
arguments + kwargs = {"order": "NCHW"} + kwargs.update(dict(args.kwargs)) + + model = ModelHelper(name=args.benchmark_name) + + op_type = args.operator # assumes a brew type op name + input_name = args.input_name + output_name = args.output_name + + iters = int(args.instances) + for i in range(iters): + input_blob_name = input_name + (str(i) if i > 0 and args.chain else '') + output_blob_name = output_name + str(i + 1) + add_op = getattr(brew, op_type) + add_op(model, input_blob_name, output_blob_name, **kwargs) + if args.chain: + input_name, output_name = output_name, input_name + + workspace.RunNetOnce(model.param_init_net) + + init_net, predict_net = mobile_exporter.Export( + workspace, model.net, model.params + ) + + if args.debug: + print("init_net:") + for op in init_net.op: + print(" ", op.type, op.input, "-->", op.output) + print("predict_net:") + for op in predict_net.op: + print(" ", op.type, op.input, "-->", op.output) + + with open(args.predict_net, 'wb') as f: + f.write(predict_net.SerializeToString()) + with open(args.init_net, 'wb') as f: + f.write(init_net.SerializeToString()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Utilitity to generate Caffe2 benchmark models.") + parser.add_argument("operator", help="Caffe2 operator to benchmark.") + parser.add_argument("-b", "--blob", + help="Instantiate a blob --blob name=dim1,dim2,dim3", + action='append') + parser.add_argument("--context", help="Context to run on.", default="CPU") + parser.add_argument("--kwargs", help="kwargs to pass to operator.", + nargs="*", type=parse_kwarg, default=[]) + parser.add_argument("--init_net", help="Output initialization net.", + default="init_net.pb") + parser.add_argument("--predict_net", help="Output prediction net.", + default="predict_net.pb") + parser.add_argument("--benchmark_name", + help="Name of the benchmark network", + default="benchmark") + parser.add_argument("--input_name", help="Name of the input blob.", + default="data") + parser.add_argument("--output_name", help="Name of the output blob.", + default="output") + parser.add_argument("--instances", + help="Number of instances to run the operator.", + default="1") + parser.add_argument("-d", "--debug", help="Print debug information.", + action='store_true') + parser.add_argument("-c", "--chain", + help="Chain ops together (create data dependencies)", + action='store_true') + args = parser.parse_args() + main(args) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc new file mode 100644 index 0000000..52b5117 --- /dev/null +++ b/binaries/benchmark_helper.cc @@ -0,0 +1,300 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "binaries/benchmark_helper.h" +#include "caffe2/core/blob_serialization.h" +#ifdef __CUDA_ARCH__ +#include "caffe2/core/context_gpu.h" +#endif +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/bench_utils.h" +#include "caffe2/utils/string_utils.h" +#include "observers/net_observer_reporter_print.h" +#include "observers/observer_config.h" +#include "observers/perf_observer.h" + +using std::map; +using std::shared_ptr; +using std::string; +using std::unique_ptr; +using std::vector; + +void observerConfig() { + caffe2::ClearGlobalNetObservers(); + caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) { + return caffe2::make_unique(subject); + }); + caffe2::ObserverConfig::setReporter( + caffe2::make_unique()); +} + +bool backendCudaSet(const string& backend) { + bool run_on_gpu = false; + if (backend == "cuda") { +#ifdef __CUDA_ARCH__ + if (caffe2::HasCudaGPU()) { + run_on_gpu = true; + } else { + CAFFE_THROW("NO GPU support on this host machine"); + } +#else + CAFFE_THROW("NO GPU support"); +#endif + } + return run_on_gpu; +} + +void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) { + for (int j = 0; j < net_def->op_size(); j++) { + caffe2::OperatorDef* op = net_def->mutable_op(j); + op->mutable_device_option()->set_device_type(run_dev); + } +} + +void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { + if (backend != "builtin") { + string engine = backend == "nnpack" + ? "NNPACK" + : backend == "eigen" ? "EIGEN" + : backend == "mkl" ? "MKLDNN" + : backend == "cuda" + ? "CUDA" + : backend == "dnnlowp" ? "DNNLOWP" + : backend == "dnnlowp_16" + ? "DNNLOWP_16" + : backend == "default" ? "" : "NONE"; + CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); + for (int i = 0; i < net_def->op_size(); i++) { + caffe2::OperatorDef* op_def = net_def->mutable_op(i); + op_def->set_engine(engine); + } + } +} + +void loadInput( + shared_ptr workspace, + const bool run_on_gpu, + map& tensor_protos_map, + const string& input, + const string& input_file, + const string& input_dims, + const string& input_type) { + // Load input. 
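+  // Inputs named in --input can be fed in one of two ways:
+  //  1. --input_file: a comma-separated list of files containing serialized
+  //     TensorProtos, one file per input name.
+  //  2. --input_dims / --input_type: semicolon-separated dims and type
+  //     strings, one per input name; blobs of that shape are created and
+  //     resized here, on GPU or CPU depending on the chosen backend.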
+ if (input.size()) { + vector input_names = caffe2::split(',', input); + if (input_file.size()) { + vector input_files = caffe2::split(',', input_file); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_files.size(), + "Input name and file should have the same number."); + for (int i = 0; i < input_names.size(); ++i) { + caffe2::TensorProtos tensor_protos; + CAFFE_ENFORCE( + caffe2::ReadProtoFromFile(input_files[i], &tensor_protos)); + workspace->CreateBlob(input_names[i]); + tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos)); + } + } else if (input_dims.size() || input_type.size()) { + CAFFE_ENFORCE_GE( + input_dims.size(), + 0, + "Input dims must be specified when input tensors are used."); + CAFFE_ENFORCE_GE( + input_type.size(), + 0, + "Input type must be specified when input tensors are used."); + + vector input_dims_list = caffe2::split(';', input_dims); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_dims_list.size(), + "Input name and dims should have the same number of items."); + vector input_type_list = caffe2::split(';', input_type); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_type_list.size(), + "Input name and type should have the same number of items."); + for (size_t i = 0; i < input_names.size(); ++i) { + vector input_dims_str = caffe2::split(',', input_dims_list[i]); + vector input_dims; + for (const string& s : input_dims_str) { + input_dims.push_back(caffe2::stoi(s)); + } + caffe2::Blob* blob = workspace->GetBlob(input_names[i]); + if (blob == nullptr) { + blob = workspace->CreateBlob(input_names[i]); + } + if (run_on_gpu) { + LOG(INFO) << "Running on GPU."; +#ifdef __CUDA_ARCH__ + caffe2::TensorCUDA* tensor = blob->GetMutable(); + CHECK_NOTNULL(tensor); + tensor->Resize(input_dims); + if (input_type_list[i] == "uint8_t") { + tensor->mutable_data(); + } else if (input_type_list[i] == "float") { + tensor->mutable_data(); + } else { + CAFFE_THROW("Unsupported input type: ", input_type_list[i]); + } +#else + CAFFE_THROW("Not support GPU on mobile."); +#endif + } else { + caffe2::TensorCPU* tensor = blob->GetMutable(); + CHECK_NOTNULL(tensor); + tensor->Resize(input_dims); + if (input_type_list[i] == "uint8_t") { + tensor->mutable_data(); + } else if (input_type_list[i] == "float") { + tensor->mutable_data(); + } else { + CAFFE_THROW("Unsupported input type: ", input_type_list[i]); + } + } + } + } else { + CAFFE_THROW( + "You requested input tensors, but neither input_file nor " + "input_dims is set."); + } + } +} + +void fillInputBlob( + shared_ptr workspace, + map& tensor_protos_map, + int iteration) { + if (tensor_protos_map.empty()) { + return; + } + + for (auto& tensor_kv : tensor_protos_map) { + caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first); + if (blob == nullptr) { + blob = workspace->CreateBlob(tensor_kv.first); + } + // todo: support gpu and make this function a tempalte + int protos_size = tensor_kv.second.protos_size(); + caffe2::TensorProto* tensor_proto = + tensor_kv.second.mutable_protos(iteration % protos_size); + caffe2::TensorCPU* tensor = blob->GetMutable(); + tensor->Resize(std::vector()); + if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { + (tensor->mutable_data())[0] = tensor_proto->string_data(0); + } else if (tensor_proto->data_type() == caffe2::TensorProto::FLOAT) { + (tensor->mutable_data())[0] = tensor_proto->float_data(0); + } + // todo: for other types + } +} + +void runNetwork( + shared_ptr workspace, + caffe2::NetDef& net_def, + map& tensor_protos_map, + const bool wipe_cache, + const bool 
run_individual, + const int warmup, + const int iter) { + if (!net_def.has_name()) { + net_def.set_name("benchmark"); + } + + caffe2::NetBase* net = workspace->CreateNet(net_def); + CHECK_NOTNULL(net); + + LOG(INFO) << "Starting benchmark."; + caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); + LOG(INFO) << "Running warmup runs."; + for (int i = 0; i < warmup; ++i) { + fillInputBlob(workspace, tensor_protos_map, i); + CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); + } + + if (wipe_cache) { + caffe2::wipe_cache(); + } + LOG(INFO) << "Main runs."; + CAFFE_ENFORCE( + iter >= 0, + "Number of main runs should be non negative, provided ", + iter, + "."); + for (int i = 0; i < iter; ++i) { + caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); + fillInputBlob(workspace, tensor_protos_map, i); + CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); + if (wipe_cache) { + caffe2::wipe_cache(); + } + if (run_individual) { + caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); + CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); + if (wipe_cache) { + caffe2::wipe_cache(); + } + } + } +} + +void writeOutput( + shared_ptr workspace, + const bool run_on_gpu, + const string& output, + const string& output_folder, + const bool text_output) { + string output_prefix = output_folder.size() ? output_folder + "/" : ""; + if (output.size()) { + vector output_names = caffe2::split(',', output); + if (output == "*") { + output_names = workspace->Blobs(); + } + for (const string& name : output_names) { + CAFFE_ENFORCE( + workspace->HasBlob(name), + "You requested a non-existing blob: ", + name); + if (text_output) { + if (run_on_gpu) { +#ifdef __CUDA_ARCH__ + writeTextOutput( + workspace->GetBlob(name)->GetMutable(), + output_prefix, + name); +#else + CAFFE_THROW("Not support GPU."); +#endif + } else { + writeTextOutput( + workspace->GetBlob(name)->GetMutable(), + output_prefix, + name); + } + } else { + string serialized = workspace->GetBlob(name)->Serialize(name); + string output_filename = output_prefix + name; + caffe2::WriteStringToFile(serialized, output_filename.c_str()); + } + } + } +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h new file mode 100644 index 0000000..0a52e16 --- /dev/null +++ b/binaries/benchmark_helper.h @@ -0,0 +1,99 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/string_utils.h" + +using std::map; +using std::shared_ptr; +using std::string; +using std::vector; + +template +void writeTextOutput( + TensorType* tensor, + const string& output_prefix, + const string& name) { + string output_name = output_prefix + "/" + name + ".txt"; + caffe2::TensorSerializer ser; + caffe2::BlobProto blob_proto; + ser.Serialize( + *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size()); + blob_proto.set_name(output_name); + blob_proto.set_type("Tensor"); + CAFFE_ENFORCE(blob_proto.has_tensor()); + caffe2::TensorProto tensor_proto = blob_proto.tensor(); + vector data; + switch (tensor_proto.data_type()) { + case caffe2::TensorProto::FLOAT: { + std::copy( + tensor_proto.float_data().begin(), + tensor_proto.float_data().end(), + std::back_inserter(data)); + break; + } + case caffe2::TensorProto::INT32: { + std::copy( + tensor_proto.int32_data().begin(), + tensor_proto.int32_data().end(), + std::back_inserter(data)); + break; + } + default: + CAFFE_THROW("Unimplemented Blob type."); + } + std::ofstream output_file(output_name); + std::ostream_iterator output_iterator(output_file, "\n"); + std::copy(data.begin(), data.end(), output_iterator); +} + +void observerConfig(); +bool backendCudaSet(const string&); +void setDeviceType(caffe2::NetDef*, caffe2::DeviceType&); +void setOperatorEngine(caffe2::NetDef*, const string&); +void loadInput( + shared_ptr, + const bool, + map&, + const string&, + const string&, + const string&, + const string&); +void fillInputBlob( + shared_ptr, + map&, + int iteration); +void writeOutput( + shared_ptr, + const bool, + const string&, + const string&, + const bool); +void runNetwork( + shared_ptr, + caffe2::NetDef&, + map&, + const bool, + const bool, + const int, + const int); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc new file mode 100644 index 0000000..729479a --- /dev/null +++ b/binaries/caffe2_benchmark.cc @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "binaries/benchmark_helper.h" + +using std::make_shared; +using std::map; +using std::string; +using std::vector; + +CAFFE2_DEFINE_string( + backend, + "builtin", + "The backend to use when running the model. The allowed " + "backend choices are: builtin, default, nnpack, eigen, mkl, cuda"); + +CAFFE2_DEFINE_string( + init_net, + "", + "The given net to initialize any parameters."); +CAFFE2_DEFINE_string( + input, + "", + "Input that is needed for running the network. If " + "multiple input needed, use comma separated string."); +CAFFE2_DEFINE_string( + input_dims, + "", + "Alternate to input_files, if all inputs are simple " + "float TensorCPUs, specify the dimension using comma " + "separated numbers. If multiple input needed, use " + "semicolon to separate the dimension of different " + "tensors."); +CAFFE2_DEFINE_string( + input_file, + "", + "Input file that contain the serialized protobuf for " + "the input blobs. If multiple input needed, use comma " + "separated string. Must have the same number of items " + "as input does."); +CAFFE2_DEFINE_string( + input_type, + "float", + "Input type when specifying the input dimension." 
+ "The supported types are float, uint8_t."); +CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); +CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); +CAFFE2_DEFINE_string( + output, + "", + "Output that should be dumped after the execution " + "finishes. If multiple outputs are needed, use comma " + "separated string. If you want to dump everything, pass " + "'*' as the output value."); +CAFFE2_DEFINE_string( + output_folder, + "", + "The folder that the output should be written to. This " + "folder must already exist in the file system."); +CAFFE2_DEFINE_bool( + run_individual, + false, + "Whether to benchmark individual operators."); +CAFFE2_DEFINE_bool( + text_output, + false, + "Whether to write out output in text format for regression purpose."); +CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_bool( + wipe_cache, + false, + "Whether to evict the cache before running network."); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); + + // support other device type in the future? + caffe2::DeviceType run_dev = run_on_gpu ? caffe2::CUDA : caffe2::CPU; + + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); + setDeviceType(&init_net_def, run_dev); + setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. + caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); + setDeviceType(&net_def, run_dev); + setOperatorEngine(&net_def, caffe2::FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + caffe2::FLAGS_input, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_dims, + caffe2::FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + caffe2::FLAGS_wipe_cache, + caffe2::FLAGS_run_individual, + caffe2::FLAGS_warmup, + caffe2::FLAGS_iter); + + writeOutput( + workspace, + run_on_gpu, + caffe2::FLAGS_output, + caffe2::FLAGS_output_folder, + caffe2::FLAGS_text_output); + + return 0; +} diff --git a/binaries/convert_caffe_image_db.cc b/binaries/convert_caffe_image_db.cc new file mode 100644 index 0000000..ef5a570 --- /dev/null +++ b/binaries/convert_caffe_image_db.cc @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe/proto/caffe.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_string(output_db, "", "The output db."); +CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::Transaction; +using caffe2::TensorProto; +using caffe2::TensorProtos; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr out_db(caffe2::db::CreateDB( + caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); + std::unique_ptr cursor(in_db->NewCursor()); + std::unique_ptr transaction(out_db->NewTransaction()); + int count = 0; + for (; cursor->Valid(); cursor->Next()) { + caffe::Datum datum; + CAFFE_ENFORCE(datum.ParseFromString(cursor->value())); + TensorProtos protos; + TensorProto* data = protos.add_protos(); + TensorProto* label = protos.add_protos(); + label->set_data_type(TensorProto::INT32); + label->add_dims(1); + label->add_int32_data(datum.label()); + if (datum.encoded()) { + // This is an encoded image. we will copy over the data directly. + data->set_data_type(TensorProto::STRING); + data->add_dims(1); + data->add_string_data(datum.data()); + } else { + // float data not supported right now. + CAFFE_ENFORCE_EQ(datum.float_data_size(), 0); + std::vector buffer_vec(datum.data().size()); + char* buffer = buffer_vec.data(); + // swap order from CHW to HWC + int channels = datum.channels(); + int size = datum.height() * datum.width(); + CAFFE_ENFORCE_EQ(datum.data().size(), channels * size); + for (int c = 0; c < channels; ++c) { + char* dst = buffer + c; + const char* src = datum.data().c_str() + c * size; + for (int n = 0; n < size; ++n) { + dst[n*channels] = src[n]; + } + } + data->set_data_type(TensorProto::BYTE); + data->add_dims(datum.height()); + data->add_dims(datum.width()); + data->add_dims(datum.channels()); + data->set_byte_data(buffer, datum.data().size()); + } + transaction->Put(cursor->key(), protos.SerializeAsString()); + if (++count % caffe2::FLAGS_batch_size == 0) { + transaction->Commit(); + LOG(INFO) << "Converted " << count << " items so far."; + } + } + LOG(INFO) << "A total of " << count << " items processed."; + return 0; +} + diff --git a/binaries/convert_db.cc b/binaries/convert_db.cc new file mode 100644 index 0000000..cb0710a --- /dev/null +++ b/binaries/convert_db.cc @@ -0,0 +1,51 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_string(output_db, "", "The output db."); +CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::Transaction; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr out_db(caffe2::db::CreateDB( + caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); + std::unique_ptr cursor(in_db->NewCursor()); + std::unique_ptr transaction(out_db->NewTransaction()); + int count = 0; + for (; cursor->Valid(); cursor->Next()) { + transaction->Put(cursor->key(), cursor->value()); + if (++count % caffe2::FLAGS_batch_size == 0) { + transaction->Commit(); + LOG(INFO) << "Converted " << count << " items so far."; + } + } + LOG(INFO) << "A total of " << count << " items processed."; + return 0; +} diff --git a/binaries/convert_encoded_to_raw_leveldb.cc b/binaries/convert_encoded_to_raw_leveldb.cc new file mode 100644 index 0000000..4e272fc --- /dev/null +++ b/binaries/convert_encoded_to_raw_leveldb.cc @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts an image dataset to leveldb. +// +// caffe2::FLAGS_input_folder is the root folder that holds all the images, and +// caffe2::FLAGS_list_file should be a list of files as well as their labels, in the +// format as +// subfolder1/file1.JPEG 7 +// .... 
+ +#include + +#include // NOLINT(readability/streams) +#include +#include +#include + +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" +#include "leveldb/db.h" +#include "leveldb/write_batch.h" + +CAFFE2_DEFINE_string(input_db_name, "", "The input image file name."); +CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); +CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); +CAFFE2_DEFINE_int(scale, 256, + "If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given " + "value."); +CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); + + +namespace caffe2 { + +using std::string; +using std::unique_ptr; + +void ConvertToRawDataset( + const string& input_db_name, const string& output_db_name) { + // input leveldb + std::unique_ptr input_db; + LOG(INFO) << "Opening input leveldb " << input_db_name; + { + leveldb::Options options; + options.create_if_missing = false; + leveldb::DB* db_temp; + leveldb::Status status = leveldb::DB::Open( + options, input_db_name, &db_temp); + CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, "."); + input_db.reset(db_temp); + } + + // output leveldb + std::unique_ptr output_db; + std::unique_ptr batch; + LOG(INFO) << "Opening leveldb " << output_db_name; + { + leveldb::Options options; + options.error_if_exists = true; + options.create_if_missing = true; + options.write_buffer_size = 268435456; + leveldb::DB* db_temp; + leveldb::Status status = leveldb::DB::Open( + options, output_db_name, &db_temp); + CAFFE_ENFORCE( + status.ok(), + "Failed to open leveldb ", + output_db_name, + ". Is it already existing?"); + output_db.reset(db_temp); + } + batch.reset(new leveldb::WriteBatch()); + + TensorProtos input_protos; + TensorProtos output_protos; + TensorProto* data = output_protos.add_protos(); + TensorProto* label = output_protos.add_protos(); + data->set_data_type(TensorProto::BYTE); + data->add_dims(0); + data->add_dims(0); + if (caffe2::FLAGS_color) { + data->add_dims(3); + } + string value; + + unique_ptr iter; + iter.reset(input_db->NewIterator(leveldb::ReadOptions())); + iter->SeekToFirst(); + int count = 0; + for (; iter->Valid(); iter->Next()) { + CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString())); + label->CopyFrom(input_protos.protos(1)); + const string& encoded_image = input_protos.protos(0).string_data(0); + int encoded_size = encoded_image.size(); + cv::Mat img = cv::imdecode( + cv::Mat(1, &encoded_size, CV_8UC1, + const_cast(encoded_image.data())), + caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); + cv::Mat resized_img; + int scaled_width, scaled_height; + if (caffe2::FLAGS_warp) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = caffe2::FLAGS_scale; + } else if (img.rows > img.cols) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; + } else { + scaled_height = caffe2::FLAGS_scale; + scaled_width = static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; + } + cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0, + cv::INTER_LINEAR); + data->set_dims(0, scaled_height); + data->set_dims(1, scaled_width); + DCHECK(resized_img.isContinuous()); + data->set_byte_data(resized_img.ptr(), + scaled_height * scaled_width * (caffe2::FLAGS_color ? 
3 : 1)); + output_protos.SerializeToString(&value); + // Put in db + batch->Put(iter->key(), value); + if (++count % 1000 == 0) { + output_db->Write(leveldb::WriteOptions(), batch.get()); + batch.reset(new leveldb::WriteBatch()); + LOG(INFO) << "Processed " << count << " files."; + } + } + // write the last batch + if (count % 1000 != 0) { + output_db->Write(leveldb::WriteOptions(), batch.get()); + } + LOG(INFO) << "Processed a total of " << count << " files."; +} + +} // namespace caffe2 + + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertToRawDataset( + caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name); + return 0; +} diff --git a/binaries/core_overhead_benchmark.cc b/binaries/core_overhead_benchmark.cc new file mode 100644 index 0000000..74f19d5 --- /dev/null +++ b/binaries/core_overhead_benchmark.cc @@ -0,0 +1,223 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark.h" + +#include "caffe2/core/context.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/operator.h" + +#define CAFFE2_SKIP_IF_NO_GPU \ + if (!caffe2::NumCudaDevices()) { \ + state.SkipWithError("No CUDA available, skipping benchmark."); \ + return; \ + } + +using namespace caffe2; + +static void BM_CUDAContextCreation(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + volatile CUDAContext context_so_we_do_initialization_work; + while (state.KeepRunning()) { + volatile CUDAContext context; + } +} +BENCHMARK(BM_CUDAContextCreation); + +static void BM_CUDAContextStreamAccess(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + CUDAContext context; + while (state.KeepRunning()) { + volatile cudaStream_t stream = context.cuda_stream(); + } +} +BENCHMARK(BM_CUDAContextStreamAccess); + +static void BM_cudaGetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int id; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaGetDevice(&id)); + } +} +BENCHMARK(BM_cudaGetDevice); + +static void BM_cudaSetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int total = NumCudaDevices(); + int i = 0; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice((i++) % total)); + } +} +BENCHMARK(BM_cudaSetDevice); + +static void BM_cudaSetAndGetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int total = NumCudaDevices(); + int i = 0; + int id; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice((i++) % total)); + CUDA_ENFORCE(cudaGetDevice(&id)); + } +} +BENCHMARK(BM_cudaSetAndGetDevice); + +static void BM_cudaSetSameDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice(0)); + } +} +BENCHMARK(BM_cudaSetSameDevice); + +static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + cudaStream_t stream; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaStreamCreate(&stream)); + CUDA_ENFORCE(cudaStreamSynchronize(stream)); + 
CUDA_ENFORCE(cudaStreamDestroy(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamCreateSyncDelete);
+
+static void BM_cudaStreamSynchronize(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamSynchronize);
+
+static void BM_cudaEventRecord(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  cudaEvent_t event;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  CUDA_ENFORCE(cudaEventCreateWithFlags(
+      &event, cudaEventDefault | cudaEventDisableTiming));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaEventRecord(event, stream));
+  }
+}
+BENCHMARK(BM_cudaEventRecord);
+
+static void BM_cudaStreamWaitEventThenStreamSynchronize(
+    benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  cudaEvent_t event;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  CUDA_ENFORCE(cudaEventCreateWithFlags(
+      &event, cudaEventDefault | cudaEventDisableTiming));
+  CUDA_ENFORCE(cudaEventRecord(event, stream));
+  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
+  CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
+    CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
+
+static void BM_CudaPointerAffinity(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
+  float* ptr = tensor.mutable_data<float>();
+  while (state.KeepRunning()) {
+    volatile int id = GetGPUIDForPointer(ptr);
+  }
+}
+BENCHMARK(BM_CudaPointerAffinity);
+
+namespace {
+template <class Context>
+class DummyEmptyOp : public Operator<Context> {
+ public:
+  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws) {}
+
+  bool RunOnDevice() final { return true; }
+};
+
+REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
+REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
+OPERATOR_SCHEMA(DummyEmpty);
+} // namespace
+
+static void BM_OperatorCreationCPU(benchmark::State& state) {
+  std::unique_ptr<OperatorBase> op;
+  OperatorDef def;
+  Workspace ws;
+  def.set_type("DummyEmpty");
+  def.mutable_device_option()->set_device_type(CPU);
+  while (state.KeepRunning()) {
+    op = CreateOperator(def, &ws);
+  }
+}
+BENCHMARK(BM_OperatorCreationCPU);
+
+static void BM_OperatorCreationCUDA(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  std::unique_ptr<OperatorBase> op;
+  OperatorDef def;
+  Workspace ws;
+  def.set_type("DummyEmpty");
+  def.mutable_device_option()->set_device_type(CUDA);
+  while (state.KeepRunning()) {
+    op = CreateOperator(def, &ws);
+  }
+}
+BENCHMARK(BM_OperatorCreationCUDA);
+
+static void BM_RawAllocDeallocCPU(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Allocating only 1 byte in order to measure the overhead.
+    auto ptr_and_deleter = GetCPUAllocator()->New(1);
+    // Deallocate.
+ ptr_and_deleter.second(ptr_and_deleter.first); + } +} +BENCHMARK(BM_RawAllocDeallocCPU); + +static void BM_TensorAllocDeallocCPU(benchmark::State& state) { + Tensor tensor; + // small allocation + tensor.Resize(32, 32); + while (state.KeepRunning()) { + CHECK(tensor.mutable_data()); + tensor.FreeMemory(); + } +} +BENCHMARK(BM_TensorAllocDeallocCPU); + +static void BM_TensorAllocDeallocCUDA(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + Tensor tensor; + // small allocation + tensor.Resize(32, 32); + while (state.KeepRunning()) { + CHECK(tensor.mutable_data()); + tensor.FreeMemory(); + } +} +BENCHMARK(BM_TensorAllocDeallocCUDA); + +BENCHMARK_MAIN(); diff --git a/binaries/db_throughput.cc b/binaries/db_throughput.cc new file mode 100644 index 0000000..5d8fe5c --- /dev/null +++ b/binaries/db_throughput.cc @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/core/timer.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_int(report_interval, 1000, "The report interval."); +CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test."); +CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface."); +CAFFE2_DEFINE_int(num_read_threads, 1, + "The number of concurrent reading threads."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::DBReader; +using caffe2::string; + +void TestThroughputWithDB() { + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr cursor(in_db->NewCursor()); + for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { + caffe2::Timer timer; + for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { + string key = cursor->key(); + string value = cursor->value(); + //VLOG(1) << "Key " << key; + cursor->Next(); + if (!cursor->Valid()) { + cursor->SeekToFirst(); + } + } + double elapsed_seconds = timer.Seconds(); + printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n", + iter_id, elapsed_seconds, + caffe2::FLAGS_report_interval / elapsed_seconds); + } +} + +void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) { + string key, value; + for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { + caffe2::Timer timer; + for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { + reader->Read(&key, &value); + } + double elapsed_seconds = timer.Seconds(); + printf("Thread %03d iteration %03d, took %4.5f seconds, " + "throughput %f items/sec.\n", + thread_id, iter_id, elapsed_seconds, + caffe2::FLAGS_report_interval / elapsed_seconds); + } +} + +void TestThroughputWithReader() { + caffe2::db::DBReader reader( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db); + std::vector> 
reading_threads( + caffe2::FLAGS_num_read_threads); + for (int i = 0; i < reading_threads.size(); ++i) { + reading_threads[i].reset(new std::thread( + TestThroughputWithReaderWorker, &reader, i)); + } + for (int i = 0; i < reading_threads.size(); ++i) { + reading_threads[i]->join(); + } +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + if (caffe2::FLAGS_use_reader) { + TestThroughputWithReader(); + } else { + TestThroughputWithDB(); + } + return 0; +} diff --git a/binaries/inspect_gpus.cc b/binaries/inspect_gpus.cc new file mode 100644 index 0000000..6b80a4e --- /dev/null +++ b/binaries/inspect_gpus.cc @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include "caffe2/core/common_gpu.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" + +using std::vector; + +CAFFE2_DECLARE_int(caffe2_log_level); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::SetUsageMessage( + "Inspects the GPUs on the current machine and prints out their details " + "provided by cuda."); + + int gpu_count; + CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count)); + for (int i = 0; i < gpu_count; ++i) { + LOG(INFO) << "Querying device ID = " << i; + caffe2::DeviceQuery(i); + } + + vector > access_pattern; + CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern)); + + std::stringstream sstream; + // Find topology + for (int i = 0; i < gpu_count; ++i) { + for (int j = 0; j < gpu_count; ++j) { + sstream << (access_pattern[i][j] ? "+" : "-") << " "; + } + sstream << std::endl; + } + LOG(INFO) << "Access pattern: " << std::endl << sstream.str(); + + return 0; +} diff --git a/binaries/make_cifar_db.cc b/binaries/make_cifar_db.cc new file mode 100644 index 0000000..9f9c0bc --- /dev/null +++ b/binaries/make_cifar_db.cc @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// This script converts the CIFAR dataset to the leveldb format used +// by caffe to perform classification. 
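+// Each record in the CIFAR binary files is a label byte (preceded by an
+// extra coarse-label byte for CIFAR-100) followed by 32x32x3 bytes of
+// channel-first pixel data; ReadImage below converts it to HWC order.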
+// Usage: +// convert_cifar_data input_folder output_db_file +// The CIFAR dataset could be downloaded at +// http://www.cs.toronto.edu/~kriz/cifar.html + +#include +#include // NOLINT(readability/streams) +#include +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_folder, "", "The input folder name."); +CAFFE2_DEFINE_string(output_train_db_name, + "", "The output training db name."); +CAFFE2_DEFINE_string(output_test_db_name, + "", "The output testing db name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_bool(is_cifar100, false, + "If set, convert cifar100. Otherwise do cifar10."); + +namespace caffe2 { + +using std::stringstream; + +const int kCIFARSize = 32; +const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3; +const int kCIFAR10BatchSize = 10000; +const int kCIFAR10TestDataSize = 10000; +const int kCIFAR10TrainBatches = 5; + +const int kCIFAR100TrainDataSize = 50000; +const int kCIFAR100TestDataSize = 10000; + +void ReadImage(std::ifstream* file, int* label, char* buffer) { + char label_char; + if (caffe2::FLAGS_is_cifar100) { + // Skip the coarse label. + file->read(&label_char, 1); + } + file->read(&label_char, 1); + *label = label_char; + // Yes, there are better ways to do it, like in-place swap... but I am too + // lazy so let's just write it in a memory-wasteful way. + std::array channel_first_storage; + file->read(channel_first_storage.data(), kCIFARImageNBytes); + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) { + buffer[i * 3 + c] = + channel_first_storage[c * kCIFARSize * kCIFARSize + i]; + } + } + return; +} + +void WriteToDB(const string& filename, const int num_items, + const int& offset, db::DB* db) { + TensorProtos protos; + TensorProto* data = protos.add_protos(); + TensorProto* label = protos.add_protos(); + data->set_data_type(TensorProto::BYTE); + data->add_dims(kCIFARSize); + data->add_dims(kCIFARSize); + data->add_dims(3); + label->set_data_type(TensorProto::INT32); + label->add_dims(1); + label->add_int32_data(0); + + LOG(INFO) << "Converting file " << filename; + std::ifstream data_file(filename.c_str(), + std::ios::in | std::ios::binary); + CAFFE_ENFORCE(data_file, "Unable to open file ", filename); + char str_buffer[kCIFARImageNBytes]; + int label_value; + string serialized_protos; + std::unique_ptr transaction(db->NewTransaction()); + for (int itemid = 0; itemid < num_items; ++itemid) { + ReadImage(&data_file, &label_value, str_buffer); + data->set_byte_data(str_buffer, kCIFARImageNBytes); + label->set_int32_data(0, label_value); + protos.SerializeToString(&serialized_protos); + snprintf(str_buffer, kCIFARImageNBytes, "%05d", + offset + itemid); + transaction->Put(string(str_buffer), serialized_protos); + } +} + +void ConvertCIFAR() { + std::unique_ptr train_db( + db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name, + db::NEW)); + std::unique_ptr test_db( + db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name, + db::NEW)); + + if (!caffe2::FLAGS_is_cifar100) { + // This is cifar 10. 
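+    // CIFAR-10 ships as five training batches (data_batch_1.bin ..
+    // data_batch_5.bin) of 10000 images each plus one test batch; the
+    // training batches are written to train_db at consecutive offsets and
+    // the test batch to test_db.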
+ for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) { + stringstream train_file; + train_file << caffe2::FLAGS_input_folder << "/data_batch_" << fileid + 1 + << ".bin"; + WriteToDB(train_file.str(), kCIFAR10BatchSize, + fileid * kCIFAR10BatchSize, train_db.get()); + } + stringstream test_file; + test_file << caffe2::FLAGS_input_folder << "/test_batch.bin"; + WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get()); + } else { + // This is cifar 100. + stringstream train_file; + train_file << caffe2::FLAGS_input_folder << "/train.bin"; + WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get()); + stringstream test_file; + test_file << caffe2::FLAGS_input_folder << "/test.bin"; + WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get()); + } +} + +} // namespace caffe2 + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertCIFAR(); + return 0; +} diff --git a/binaries/make_image_db.cc b/binaries/make_image_db.cc new file mode 100644 index 0000000..2bdbb53 --- /dev/null +++ b/binaries/make_image_db.cc @@ -0,0 +1,280 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts an image dataset to a database. +// +// caffe2::FLAGS_input_folder is the root folder that holds all the images +// +// caffe2::FLAGS_list_file is the path to a file containing a list of files +// and their labels, as follows: +// +// subfolder1/file1.JPEG 7 +// subfolder1/file2.JPEG 7 +// subfolder2/file1.JPEG 8 +// ... 
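+//
+// Example invocation (illustrative paths; the flags are defined below):
+//   make_image_db --db=leveldb --input_folder=/data/images/ \
+//     --list_file=/data/images/train_list.txt \
+//     --output_db_name=/data/images-train-leveldb \
+//     --num_threads=8 --shuffle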
+// + +#include + +#include +#include +#include +#include +#include +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_bool(shuffle, false, + "Randomly shuffle the order of images and their labels"); +CAFFE2_DEFINE_string(input_folder, "", "The input image file name."); +CAFFE2_DEFINE_string( + list_file, + "", + "The text file containing the list of images."); +CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_bool(raw, false, + "If set, we pre-read the images and store the raw buffer."); +CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); +CAFFE2_DEFINE_int( + scale, + 256, + "If caffe2::FLAGS_raw is set, scale the shorter edge to the given value."); +CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); +CAFFE2_DEFINE_int( + num_threads, + -1, + "Number of image parsing and conversion threads."); + +namespace caffe2 { + +class Converter { + public: + explicit Converter() { + data_ = protos_.add_protos(); + label_ = protos_.add_protos(); + if (caffe2::FLAGS_raw) { + data_->set_data_type(TensorProto::BYTE); + data_->add_dims(0); + data_->add_dims(0); + if (caffe2::FLAGS_color) { + data_->add_dims(3); + } + } else { + data_->set_data_type(TensorProto::STRING); + data_->add_dims(1); + data_->add_string_data(""); + } + label_->set_data_type(TensorProto::INT32); + label_->add_dims(1); + label_->add_int32_data(0); + } + + ~Converter() { + if (thread_.joinable()) { + thread_.join(); + } + } + + void queue(const std::pair& pair) { + in_.push(pair); + } + + void start() { + thread_ = std::thread(&Converter::run, this); + } + + std::string get() { + std::unique_lock lock(mutex_); + while (out_.empty()) { + cv_.wait(lock); + } + + auto value = out_.front(); + out_.pop(); + cv_.notify_one(); + return value; + } + + void run() { + const auto& input_folder = caffe2::FLAGS_input_folder; + std::unique_lock lock(mutex_); + std::string value; + while (!in_.empty()) { + auto pair = in_.front(); + in_.pop(); + lock.unlock(); + + label_->set_int32_data(0, pair.second); + + // Add raw file contents to DB if !raw + if (!caffe2::FLAGS_raw) { + std::ifstream image_file_stream(input_folder + pair.first); + if (!image_file_stream) { + LOG(ERROR) << "Cannot open " << input_folder << pair.first + << ". Skipping."; + } else { + data_->mutable_string_data(0)->assign( + std::istreambuf_iterator(image_file_stream), + std::istreambuf_iterator()); + } + } else { + // Load image + cv::Mat img = cv::imread( + input_folder + pair.first, + caffe2::FLAGS_color ? 
CV_LOAD_IMAGE_COLOR + : CV_LOAD_IMAGE_GRAYSCALE); + + // Resize image + cv::Mat resized_img; + int scaled_width, scaled_height; + if (caffe2::FLAGS_warp) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = caffe2::FLAGS_scale; + } else if (img.rows > img.cols) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = + static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; + } else { + scaled_height = caffe2::FLAGS_scale; + scaled_width = + static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; + } + cv::resize( + img, + resized_img, + cv::Size(scaled_width, scaled_height), + 0, + 0, + cv::INTER_LINEAR); + data_->set_dims(0, scaled_height); + data_->set_dims(1, scaled_width); + + // Assert we don't have to deal with alignment + DCHECK(resized_img.isContinuous()); + auto nbytes = resized_img.total() * resized_img.elemSize(); + data_->set_byte_data(resized_img.ptr(), nbytes); + } + + protos_.SerializeToString(&value); + + // Add serialized proto to out queue or wait if it is not empty + lock.lock(); + while (!out_.empty()) { + cv_.wait(lock); + } + out_.push(value); + cv_.notify_one(); + } + } + + protected: + TensorProtos protos_; + TensorProto* data_; + TensorProto* label_; + std::queue> in_; + std::queue out_; + + std::mutex mutex_; + std::condition_variable cv_; + std::thread thread_; +}; + +void ConvertImageDataset( + const string& input_folder, + const string& list_filename, + const string& output_db_name, + const bool /*shuffle*/) { + std::ifstream list_file(list_filename); + std::vector > lines; + std::string filename; + int file_label; + while (list_file >> filename >> file_label) { + lines.push_back(std::make_pair(filename, file_label)); + } + + if (caffe2::FLAGS_shuffle) { + LOG(INFO) << "Shuffling data"; + std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701)); + } + + auto num_threads = caffe2::FLAGS_num_threads; + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + } + + LOG(INFO) << "Processing " << lines.size() << " images..."; + LOG(INFO) << "Opening DB " << output_db_name; + + auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW); + auto transaction = db->NewTransaction(); + + LOG(INFO) << "Using " << num_threads << " processing threads..."; + std::vector converters(num_threads); + + // Queue entries across converters + for (auto i = 0; i < lines.size(); i++) { + converters[i % converters.size()].queue(lines[i]); + } + + // Start all converters + for (auto& converter : converters) { + converter.start(); + } + + constexpr auto key_max_length = 256; + char key_cstr[key_max_length]; + string value; + int count = 0; + for (auto i = 0; i < lines.size(); i++) { + // Get serialized proto for this entry + auto value = converters[i % converters.size()].get(); + + // Synthesize key for this entry + auto key_len = snprintf( + key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str()); + DCHECK_LE(key_len, sizeof(key_cstr)); + + // Put in db + transaction->Put(string(key_cstr), value); + + if (++count % 1000 == 0) { + // Commit the current writes. 
+ transaction->Commit(); + LOG(INFO) << "Processed " << count << " files."; + } + } + + // Commit final transaction + transaction->Commit(); + LOG(INFO) << "Processed " << count << " files."; +} + +} // namespace caffe2 + + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertImageDataset( + caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file, + caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle); + return 0; +} diff --git a/binaries/make_mnist_db.cc b/binaries/make_mnist_db.cc new file mode 100644 index 0000000..8737d0e --- /dev/null +++ b/binaries/make_mnist_db.cc @@ -0,0 +1,139 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts the MNIST dataset to leveldb. +// The MNIST dataset could be downloaded at +// http://yann.lecun.com/exdb/mnist/ + +#include // NOLINT(readability/streams) +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(image_file, "", "The input image file name."); +CAFFE2_DEFINE_string(label_file, "", "The label file name."); +CAFFE2_DEFINE_string(output_file, "", "The output db name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_int(data_limit, -1, + "If set, only output this number of data points."); +CAFFE2_DEFINE_bool(channel_first, false, + "If set, write the data as channel-first (CHW order) as the old " + "Caffe does."); + +namespace caffe2 { +uint32_t swap_endian(uint32_t val) { + val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + return (val << 16) | (val >> 16); +} + +void convert_dataset(const char* image_filename, const char* label_filename, + const char* db_path, const int data_limit) { + // Open files + std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); + std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); + CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename); + CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename); + // Read the magic and the meta data + uint32_t magic; + uint32_t num_items; + uint32_t num_labels; + uint32_t rows; + uint32_t cols; + + image_file.read(reinterpret_cast(&magic), 4); + magic = swap_endian(magic); + if (magic == 529205256) { + LOG(FATAL) << + "It seems that you forgot to unzip the mnist dataset. You should " + "first unzip them using e.g. 
gunzip on Linux.";
+  }
+  CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic.");
+  label_file.read(reinterpret_cast<char*>(&magic), 4);
+  magic = swap_endian(magic);
+  CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic.");
+  image_file.read(reinterpret_cast<char*>(&num_items), 4);
+  num_items = swap_endian(num_items);
+  label_file.read(reinterpret_cast<char*>(&num_labels), 4);
+  num_labels = swap_endian(num_labels);
+  CAFFE_ENFORCE_EQ(num_items, num_labels);
+  image_file.read(reinterpret_cast<char*>(&rows), 4);
+  rows = swap_endian(rows);
+  image_file.read(reinterpret_cast<char*>(&cols), 4);
+  cols = swap_endian(cols);
+
+  // leveldb
+  std::unique_ptr<db::DB> mnist_db(
+      db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW));
+  std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
+  // Storing to db
+  char label_value;
+  std::vector<char> pixels(rows * cols);
+  int count = 0;
+  const int kMaxKeyLength = 10;
+  char key_cstr[kMaxKeyLength];
+  string value;
+
+  TensorProtos protos;
+  TensorProto* data = protos.add_protos();
+  TensorProto* label = protos.add_protos();
+  data->set_data_type(TensorProto::BYTE);
+  if (caffe2::FLAGS_channel_first) {
+    data->add_dims(1);
+    data->add_dims(rows);
+    data->add_dims(cols);
+  } else {
+    data->add_dims(rows);
+    data->add_dims(cols);
+    data->add_dims(1);
+  }
+  label->set_data_type(TensorProto::INT32);
+  label->add_int32_data(0);
+
+  LOG(INFO) << "A total of " << num_items << " items.";
+  LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
+  for (int item_id = 0; item_id < num_items; ++item_id) {
+    image_file.read(pixels.data(), rows * cols);
+    label_file.read(&label_value, 1);
+    for (int i = 0; i < rows * cols; ++i) {
+      data->set_byte_data(pixels.data(), rows * cols);
+    }
+    label->set_int32_data(0, static_cast<int>(label_value));
+    snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
+    protos.SerializeToString(&value);
+    string keystr(key_cstr);
+
+    // Put in db
+    transaction->Put(keystr, value);
+    if (++count % 1000 == 0) {
+      transaction->Commit();
+    }
+    if (data_limit > 0 && count == data_limit) {
+      LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
+      break;
+    }
+  }
+}
+}  // namespace caffe2
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(),
+      caffe2::FLAGS_label_file.c_str(),
+      caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit);
+  return 0;
+}
diff --git a/binaries/predictor_verifier.cc b/binaries/predictor_verifier.cc
new file mode 100644
index 0000000..e82a8e9
--- /dev/null
+++ b/binaries/predictor_verifier.cc
@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "caffe2/core/flags.h" +#include "caffe2/core/init.h" +#include "caffe2/core/predictor.h" +#include "caffe2/utils/proto_utils.h" + +CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer."); +CAFFE2_DEFINE_string( + predict_net, + "", + "The given path to the predict protobuffer."); + +namespace caffe2 { + +void run() { + if (FLAGS_init_net.empty()) { + LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net."; + } + if (FLAGS_predict_net.empty()) { + LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net."; + } + caffe2::NetDef init_net, predict_net; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net)); + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net)); + // Can be large due to constant fills + VLOG(1) << "Init net: " << ProtoDebugString(init_net); + LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net); + auto predictor = caffe2::make_unique(init_net, predict_net); + LOG(INFO) << "Checking that a null forward-pass works"; + Predictor::TensorVector inputVec, outputVec; + predictor->run(inputVec, &outputVec); + CAFFE_ENFORCE_GT(outputVec.size(), 0); +} +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::run(); + // This is to allow us to use memory leak checks. + caffe2::ShutdownProtobufLibrary(); + return 0; +} diff --git a/binaries/print_core_object_sizes.cc b/binaries/print_core_object_sizes.cc new file mode 100644 index 0000000..2000c34 --- /dev/null +++ b/binaries/print_core_object_sizes.cc @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/context.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/proto/caffe2.pb.h" + +#define PRINT_SIZE(cls) \ + std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \ + << std::endl; + +int main(int /* unused */, char** /* unused */) { + PRINT_SIZE(caffe2::Blob); + PRINT_SIZE(caffe2::Tensor); + PRINT_SIZE(caffe2::Tensor); + PRINT_SIZE(caffe2::CPUContext); + PRINT_SIZE(caffe2::CUDAContext); + PRINT_SIZE(caffe2::OperatorBase); + PRINT_SIZE(caffe2::OperatorDef); + PRINT_SIZE(caffe2::Operator); + PRINT_SIZE(caffe2::Operator); + PRINT_SIZE(caffe2::TypeMeta); + PRINT_SIZE(caffe2::Workspace); + return 0; +} diff --git a/binaries/print_registered_core_operators.cc b/binaries/print_registered_core_operators.cc new file mode 100644 index 0000000..c76ea3e --- /dev/null +++ b/binaries/print_registered_core_operators.cc @@ -0,0 +1,73 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/operator_schema.h" + +CAFFE2_DEFINE_string(schema, "", + "Print doc and schema of a particular operator"); + +static bool HasSchema(const std::string& str) { + return caffe2::OpSchemaRegistry::Schema(str); +} + +static bool HasDoc(const std::string& str) { + const auto* schema = caffe2::OpSchemaRegistry::Schema(str); + return (schema != nullptr) && (schema->doc() != nullptr); +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + if (!caffe2::FLAGS_schema.empty()) { + const auto* schema = caffe2::OpSchemaRegistry::Schema( + caffe2::FLAGS_schema); + if (!schema) { + std::cerr << "Operator " << caffe2::FLAGS_schema + << " doesn't have a schema" << std::endl; + return 1; + } + std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl + << *schema; + return 0; + } + + for (const auto& pair : *caffe2::gDeviceTypeRegistry()) { + std::cout << "Device type " << pair.first +#ifndef CAFFE2_USE_LITE_PROTO + << " (" << caffe2::DeviceType_Name( + static_cast(pair.first)) + << ")" +#endif + << std::endl; + for (const auto& key : pair.second->Keys()) { + std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key) + << ")\t" << key << std::endl; + } + } + + std::cout << "Operators that have gradients registered:" << std::endl; + for (const auto& key : caffe2::GradientRegistry()->Keys()) { + std::cout << "\t(schema: " << HasSchema(key) << ", doc: " + << HasDoc(key) << ")\t" + << key << std::endl; + } + return 0; +} diff --git a/binaries/run_plan.cc b/binaries/run_plan.cc new file mode 100644 index 0000000..5ad2c3a --- /dev/null +++ b/binaries/run_plan.cc @@ -0,0 +1,40 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + if (caffe2::FLAGS_plan.size() == 0) { + LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan."; + return 0; + } + LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; + caffe2::PlanDef plan_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); + std::unique_ptr workspace(new caffe2::Workspace()); + workspace->RunPlan(plan_def); + + // This is to allow us to use memory leak checks. 
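+  // ShutdownProtobufLibrary frees protobuf's globally allocated objects so
+  // that leak checkers (e.g. valgrind) can report a clean exit.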
+ caffe2::ShutdownProtobufLibrary(); + return 0; +} diff --git a/binaries/run_plan_mpi.cc b/binaries/run_plan_mpi.cc new file mode 100644 index 0000000..ee720fa --- /dev/null +++ b/binaries/run_plan_mpi.cc @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); + +int main(int argc, char** argv) { + caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it."); + int mpi_ret; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret); + if (mpi_ret != MPI_THREAD_MULTIPLE && + mpi_ret != MPI_THREAD_SERIALIZED) { + std::cerr << "Caffe2 MPI requires the underlying MPI to support the " + "MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n"; + return 1; + } + caffe2::GlobalInit(&argc, &argv); + LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; + caffe2::PlanDef plan_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); + std::unique_ptr workspace(new caffe2::Workspace()); + workspace->RunPlan(plan_def); + + // This is to allow us to use memory leak checks. + caffe2::ShutdownProtobufLibrary(); + MPI_Finalize(); + return 0; +} diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc new file mode 100644 index 0000000..196be4a --- /dev/null +++ b/binaries/speed_benchmark.cc @@ -0,0 +1,211 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#ifdef CAFFE2_OPTIMIZER +#include "caffe2/opt/optimizer.h" +#endif +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/utils/string_utils.h" + +CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); +CAFFE2_DEFINE_string( + init_net, + "", + "The given net to initialize any parameters."); +CAFFE2_DEFINE_string( + input, + "", + "Input that is needed for running the network. If " + "multiple input needed, use comma separated string."); +CAFFE2_DEFINE_string( + input_file, + "", + "Input file that contain the serialized protobuf for " + "the input blobs. If multiple input needed, use comma " + "separated string. 
Must have the same number of items " + "as input does."); +CAFFE2_DEFINE_string( + input_dims, + "", + "Alternate to input_files, if all inputs are simple " + "float TensorCPUs, specify the dimension using comma " + "separated numbers. If multiple input needed, use " + "semicolon to separate the dimension of different " + "tensors."); +CAFFE2_DEFINE_string(input_type, "", "Input type (uint8_t/float)"); +CAFFE2_DEFINE_string( + output, + "", + "Output that should be dumped after the execution " + "finishes. If multiple outputs are needed, use comma " + "separated string. If you want to dump everything, pass " + "'*' as the output value."); +CAFFE2_DEFINE_string( + output_folder, + "", + "The folder that the output should be written to. This " + "folder must already exist in the file system."); +CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); +CAFFE2_DEFINE_int(opt, 0, "The level of optimization to run automatically."); +CAFFE2_DEFINE_bool( + run_individual, + false, + "Whether to benchmark individual operators."); + +CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators"); +CAFFE2_DEFINE_string(engine, "", "Forced engine field value"); +CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators"); +CAFFE2_DEFINE_string(algo, "", "Forced algo arg value"); + +using std::string; +using std::unique_ptr; +using std::vector; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + unique_ptr workspace(new caffe2::Workspace()); + + // Run initialization network. + caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def)); + CAFFE_ENFORCE(workspace->RunNetOnce(net_def)); + + // Load input. 
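+  // Inputs may be given either as serialized BlobProto files (--input_file)
+  // or, for plain float/uint8_t tensors, as shapes via --input_dims plus
+  // --input_type; the two branches below cover these two cases.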
+  if (caffe2::FLAGS_input.size()) {
+    vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
+    if (caffe2::FLAGS_input_file.size()) {
+      vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_files.size(),
+          "Input name and file should have the same number.");
+      for (int i = 0; i < input_names.size(); ++i) {
+        caffe2::BlobProto blob_proto;
+        CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
+        workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
+      }
+    } else if (caffe2::FLAGS_input_dims.size() || caffe2::FLAGS_input_type.size()) {
+      CAFFE_ENFORCE_GE(
+          caffe2::FLAGS_input_dims.size(),
+          0,
+          "Input dims must be specified when input tensors are used.");
+      CAFFE_ENFORCE_GE(
+          caffe2::FLAGS_input_type.size(),
+          0,
+          "Input type must be specified when input tensors are used.");
+
+      vector<string> input_dims_list =
+          caffe2::split(';', caffe2::FLAGS_input_dims);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_dims_list.size(),
+          "Input name and dims should have the same number of items.");
+      vector<string> input_type_list =
+          caffe2::split(';', caffe2::FLAGS_input_type);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_type_list.size(),
+          "Input name and type should have the same number of items.");
+      for (size_t i = 0; i < input_names.size(); ++i) {
+        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
+        vector<int> input_dims;
+        for (const string& s : input_dims_str) {
+          input_dims.push_back(caffe2::stoi(s));
+        }
+        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
+        if (blob == nullptr) {
+          blob = workspace->CreateBlob(input_names[i]);
+        }
+        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+        CHECK_NOTNULL(tensor);
+        tensor->Resize(input_dims);
+        if (input_type_list[i] == "uint8_t") {
+          tensor->mutable_data<uint8_t>();
+        } else if (input_type_list[i] == "float") {
+          tensor->mutable_data<float>();
+        } else {
+          CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
+        }
+      }
+    } else {
+      CAFFE_THROW(
+          "You requested input tensors, but neither input_file nor "
+          "input_dims is set.");
+    }
+  }
+
+  // Run main network.
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
+  if (!net_def.has_name()) {
+    net_def.set_name("benchmark");
+  }
+  // force changing engine and algo
+  if (caffe2::FLAGS_force_engine) {
+    LOG(INFO) << "force engine be: " << caffe2::FLAGS_engine;
+    for (const auto& op : net_def.op()) {
+      const_cast<caffe2::OperatorDef*>(&op)->set_engine(caffe2::FLAGS_engine);
+    }
+  }
+  if (caffe2::FLAGS_force_algo) {
+    LOG(INFO) << "force algo be: " << caffe2::FLAGS_algo;
+    for (const auto& op : net_def.op()) {
+      caffe2::GetMutableArgument(
+          "algo", true, const_cast<caffe2::OperatorDef*>(&op))
+          ->set_s(caffe2::FLAGS_algo);
+    }
+  }
+  if (caffe2::FLAGS_opt) {
+#ifdef CAFFE2_OPTIMIZER
+    net_def = caffe2::opt::optimize(net_def, workspace.get(), caffe2::FLAGS_opt);
+#else
+    LOG(WARNING) << "Caffe2 not compiled with optimization passes.";
+#endif
+  }
+
+  caffe2::NetBase* net = workspace->CreateNet(net_def);
+  CHECK_NOTNULL(net);
+  CAFFE_ENFORCE(net->Run());
+  net->TEST_Benchmark(
+      caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);
+
+  string output_prefix = caffe2::FLAGS_output_folder.size() ?
caffe2::FLAGS_output_folder + "/" + : ""; + if (caffe2::FLAGS_output.size()) { + vector output_names = caffe2::split(',', caffe2::FLAGS_output); + if (caffe2::FLAGS_output == "*") { + output_names = workspace->Blobs(); + } + for (const string& name : output_names) { + CAFFE_ENFORCE( + workspace->HasBlob(name), + "You requested a non-existing blob: ", + name); + string serialized = workspace->GetBlob(name)->Serialize(name); + string output_filename = output_prefix + name; + caffe2::WriteStringToFile(serialized, output_filename.c_str()); + } + } + + return 0; +} diff --git a/binaries/split_db.cc b/binaries/split_db.cc new file mode 100644 index 0000000..077afda --- /dev/null +++ b/binaries/split_db.cc @@ -0,0 +1,77 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_int(splits, 0, "The number of splits."); +CAFFE2_DEFINE_string(db_type, "", "The db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +namespace caffe2 { + +static int Split(int argc, char** argv) { + GlobalInit(&argc, &argv); + + CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db."); + CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number."); + CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type]."); + + unique_ptr in_db( + db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ)); + CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db); + unique_ptr cursor(in_db->NewCursor()); + // This usually won't happen, but FWIW. 
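+  // ("This" being a db that opens successfully yet returns a null cursor.)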
+  CAFFE_ENFORCE(
+      cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
+
+  vector<unique_ptr<db::DB>> out_dbs;
+  vector<unique_ptr<db::Transaction>> transactions;
+  for (int i = 0; i < FLAGS_splits; ++i) {
+    out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
+        FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
+    CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
+    transactions.push_back(
+        unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
+    CAFFE_ENFORCE(
+        transactions.back().get(), "Cannot get transaction for output db #", i);
+  }
+
+  int count = 0;
+  for (; cursor->Valid(); cursor->Next()) {
+    transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
+    if (++count % FLAGS_batch_size == 0) {
+      for (int i = 0; i < FLAGS_splits; ++i) {
+        transactions[i]->Commit();
+      }
+      LOG(INFO) << "Split " << count << " items so far.";
+    }
+  }
+  LOG(INFO) << "A total of " << count << " items processed.";
+  return 0;
+}
+
+}  // namespace caffe2
+
+int main(int argc, char** argv) {
+  return caffe2::Split(argc, argv);
+}
diff --git a/binaries/tsv_2_proto.cc b/binaries/tsv_2_proto.cc
new file mode 100644
index 0000000..e9dba77
--- /dev/null
+++ b/binaries/tsv_2_proto.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fstream>
+#include <string>
+
+#include "caffe2/core/blob_serialization.h"
+#include "caffe2/core/db.h"
+#include "caffe2/core/init.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/proto/caffe2.pb.h"
+#include "caffe2/utils/proto_utils.h"
+
+CAFFE2_DEFINE_string(f_in, "", "The input data file name.");
+CAFFE2_DEFINE_string(f_out, "", "The output data file name.");
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  std::ifstream f_in(caffe2::FLAGS_f_in);
+  std::ofstream f_out(caffe2::FLAGS_f_out);
+  std::string line;
+  caffe2::TensorProtos tensor_protos;
+  while (std::getline(f_in, line)) {
+    caffe2::TensorProto* data = tensor_protos.add_protos();
+    data->set_data_type(caffe2::TensorProto::STRING);
+    data->add_dims(0);
+    data->add_string_data(line);
+    data->set_name("text");
+  }
+  f_in.close();
+  std::string output_str;
+  tensor_protos.SerializeToString(&output_str);
+  f_out << output_str;
+  f_out.close();
+  return 0;
+}
diff --git a/binaries/tutorial_blob.cc b/binaries/tutorial_blob.cc
new file mode 100644
index 0000000..f379eac
--- /dev/null
+++ b/binaries/tutorial_blob.cc
@@ -0,0 +1,89 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "caffe2/core/blob.h"
+#include "caffe2/core/init.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/logging.h"
+
+// We will be lazy and just use the whole namespace.
+using namespace caffe2;
+
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  caffe2::ShowLogInfoToStderr();
+
+  LOG(INFO) <<
+      "This script corresponds to the Blob part of the Caffe2 C++ "
+      "tutorial.";
+
+  LOG(INFO) << "Let's create a blob myblob.";
+
+  Blob myblob;
+
+  LOG(INFO) << "Let's set it to int and set the value to 10.";
+
+  int* myint = myblob.GetMutable<int>();
+  *myint = 10;
+
+  LOG(INFO)
+      << "Is the blob type int? "
+      << myblob.IsType<int>();
+
+  LOG(INFO)
+      << "Is the blob type float? "
+      << myblob.IsType<float>();
+
+  const int& myint_const = myblob.Get<int>();
+  LOG(INFO)
+      << "The value of the int number stored in the blob is: "
+      << myint_const;
+
+  LOG(INFO)
+      << "Let's try to get a float pointer. This will trigger an exception.";
+
+  try {
+    const float& myfloat = myblob.Get<float>();
+    LOG(FATAL) << "This line should never happen.";
+  } catch (std::exception& e) {
+    LOG(INFO)
+        << "As expected, we got an exception. Its content says: "
+        << e.what();
+  }
+
+  LOG(INFO) <<
+      "However, we can change the content type (and destroy the old "
+      "content) by calling GetMutable. Let's change it to double.";
+
+  double* mydouble = myblob.GetMutable<double>();
+  *mydouble = 3.14;
+
+  LOG(INFO) << "The new content is: " << myblob.Get<double>();
+
+  LOG(INFO) <<
+      "If we have a pre-created object, we can use Reset() to transfer the "
+      "object to a blob.";
+
+  std::string* pvec = new std::string();
+  myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.
+
+  LOG(INFO) << "Is the blob now of type string? "
+            << myblob.IsType<std::string>();
+
+  LOG(INFO) << "This concludes the blob tutorial.";
+  return 0;
+}
diff --git a/binaries/zmq_feeder.cc b/binaries/zmq_feeder.cc
new file mode 100644
index 0000000..27e8684
--- /dev/null
+++ b/binaries/zmq_feeder.cc
@@ -0,0 +1,66 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This binary provides an easy way to open a zeromq server and feed data to
+// clients connected to it. It uses the Caffe2 db as the backend, thus allowing
+// one to convert any db-compliant storage to a zeromq service.
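+//
+// Example invocation (hypothetical db path; any db type registered with
+// Caffe2, such as leveldb or lmdb, should work):
+//   zmq_feeder --input_db=/path/to/some_leveldb --input_db_type=leveldb \
+//       --server=tcp://*:5555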
+ +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/utils/zmq_helper.h" + +CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address."); +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); + +using caffe2::db::DB; +using caffe2::db::Cursor; +using caffe2::string; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + LOG(INFO) << "Opening DB..."; + auto in_db = caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ); + CAFFE_ENFORCE( + in_db, + "Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " + + caffe2::FLAGS_input_db_type); + auto cursor = in_db->NewCursor(); + LOG(INFO) << "DB opened."; + + LOG(INFO) << "Starting ZeroMQ server..."; + + // Socket to talk to clients + caffe2::ZmqSocket sender(ZMQ_PUSH); + sender.Bind(caffe2::FLAGS_server); + LOG(INFO) << "Server created at " << caffe2::FLAGS_server; + + while (1) { + VLOG(1) << "Sending " << cursor->key(); + sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE); + sender.SendTillSuccess(cursor->value(), 0); + cursor->Next(); + if (!cursor->Valid()) { + cursor->SeekToFirst(); + } + } + // We do not do an elegant quit since this binary is going to be terminated by + // control+C. + return 0; +} diff --git a/caffe/__init__.py b/caffe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe/proto/CMakeLists.txt b/caffe/proto/CMakeLists.txt new file mode 100644 index 0000000..558c224 --- /dev/null +++ b/caffe/proto/CMakeLists.txt @@ -0,0 +1,17 @@ +file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto") + +caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES}) + +add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS}) + +if (MSVC) + if(BUILD_SHARED_LIBS) + set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)") + else() + set(Caffe2_API_DEFINE "-DCAFFE2_API=") + endif() + target_compile_definitions( + Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE}) +endif() + +install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto) diff --git a/caffe/proto/__init__.py b/caffe/proto/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe/proto/caffe.proto b/caffe/proto/caffe.proto new file mode 100644 index 0000000..1556781 --- /dev/null +++ b/caffe/proto/caffe.proto @@ -0,0 +1,1399 @@ +syntax = "proto2"; + +package caffe; + +// Specifies the shape (dimensions) of a Blob. +message BlobShape { + repeated int64 dim = 1 [packed = true]; +} + +message BlobProto { + optional BlobShape shape = 7; + repeated float data = 5 [packed = true]; + repeated float diff = 6 [packed = true]; + repeated double double_data = 8 [packed = true]; + repeated double double_diff = 9 [packed = true]; + + // 4D dimensions -- deprecated. Use "shape" instead. + optional int32 num = 1 [default = 0]; + optional int32 channels = 2 [default = 0]; + optional int32 height = 3 [default = 0]; + optional int32 width = 4 [default = 0]; +} + +// The BlobProtoVector is simply a way to pass multiple blobproto instances +// around. +message BlobProtoVector { + repeated BlobProto blobs = 1; +} + +message Datum { + optional int32 channels = 1; + optional int32 height = 2; + optional int32 width = 3; + // the actual image data, in bytes + optional bytes data = 4; + optional int32 label = 5; + // Optionally, the datum could also hold float data. 
+ repeated float float_data = 6; + // If true data contains an encoded image that need to be decoded + optional bool encoded = 7 [default = false]; +} + +message FillerParameter { + // The filler type. + optional string type = 1 [default = 'constant']; + optional float value = 2 [default = 0]; // the value in constant filler + optional float min = 3 [default = 0]; // the min value in uniform filler + optional float max = 4 [default = 1]; // the max value in uniform filler + optional float mean = 5 [default = 0]; // the mean value in Gaussian filler + optional float std = 6 [default = 1]; // the std value in Gaussian filler + // The expected number of non-zero output weights for a given input in + // Gaussian filler -- the default -1 means don't perform sparsification. + optional int32 sparse = 7 [default = -1]; + // Normalize the filler variance by fan_in, fan_out, or their average. + // Applies to 'xavier' and 'msra' fillers. + enum VarianceNorm { + FAN_IN = 0; + FAN_OUT = 1; + AVERAGE = 2; + } + optional VarianceNorm variance_norm = 8 [default = FAN_IN]; +} + +message NetParameter { + optional string name = 1; // consider giving the network a name + // DEPRECATED. See InputParameter. The input blobs to the network. + repeated string input = 3; + // DEPRECATED. See InputParameter. The shape of the input blobs. + repeated BlobShape input_shape = 8; + + // 4D input dimensions -- deprecated. Use "input_shape" instead. + // If specified, for each input blob there should be four + // values specifying the num, channels, height and width of the input blob. + // Thus, there should be a total of (4 * #input) numbers. + repeated int32 input_dim = 4; + + // Whether the network will force every layer to carry out backward operation. + // If set False, then whether to carry out backward is determined + // automatically according to the net structure and learning rates. + optional bool force_backward = 5 [default = false]; + // The current "state" of the network, including the phase, level, and stage. + // Some layers may be included/excluded depending on this state and the states + // specified in the layers' include and exclude fields. + optional NetState state = 6; + + // Print debugging information about results while running Net::Forward, + // Net::Backward, and Net::Update. + optional bool debug_info = 7 [default = false]; + + // The layers that make up the net. Each of their configurations, including + // connectivity and behavior, is specified as a LayerParameter. + repeated LayerParameter layer = 100; // ID 100 so layers are printed last. + + // DEPRECATED: use 'layer' instead. + repeated V1LayerParameter layers = 2; +} + +// NOTE +// Update the next available ID when you add a new SolverParameter field. +// +// SolverParameter next available ID: 41 (last added: type) +message SolverParameter { + ////////////////////////////////////////////////////////////////////////////// + // Specifying the train and test networks + // + // Exactly one train net must be specified using one of the following fields: + // train_net_param, train_net, net_param, net + // One or more test nets may be specified using any of the following fields: + // test_net_param, test_net, net_param, net + // If more than one test net field is specified (e.g., both net and + // test_net are specified), they will be evaluated in the field order given + // above: (1) test_net_param, (2) test_net, (3) net_param/net. + // A test_iter must be specified for each test_net. 
+ // A test_level and/or a test_stage may also be specified for each test_net. + ////////////////////////////////////////////////////////////////////////////// + + // Proto filename for the train net, possibly combined with one or more + // test nets. + optional string net = 24; + // Inline train net param, possibly combined with one or more test nets. + optional NetParameter net_param = 25; + + optional string train_net = 1; // Proto filename for the train net. + repeated string test_net = 2; // Proto filenames for the test nets. + optional NetParameter train_net_param = 21; // Inline train net params. + repeated NetParameter test_net_param = 22; // Inline test net params. + + // The states for the train/test nets. Must be unspecified or + // specified once per net. + // + // By default, all states will have solver = true; + // train_state will have phase = TRAIN, + // and all test_state's will have phase = TEST. + // Other defaults are set according to the NetState defaults. + optional NetState train_state = 26; + repeated NetState test_state = 27; + + // The number of iterations for each test net. + repeated int32 test_iter = 3; + + // The number of iterations between two testing phases. + optional int32 test_interval = 4 [default = 0]; + optional bool test_compute_loss = 19 [default = false]; + // If true, run an initial test pass before the first iteration, + // ensuring memory availability and printing the starting value of the loss. + optional bool test_initialization = 32 [default = true]; + optional float base_lr = 5; // The base learning rate + // the number of iterations between displaying info. If display = 0, no info + // will be displayed. + optional int32 display = 6; + // Display the loss averaged over the last average_loss iterations + optional int32 average_loss = 33 [default = 1]; + optional int32 max_iter = 7; // the maximum number of iterations + // accumulate gradients over `iter_size` x `batch_size` instances + optional int32 iter_size = 36 [default = 1]; + + // The learning rate decay policy. The currently implemented learning rate + // policies are as follows: + // - fixed: always return base_lr. + // - step: return base_lr * gamma ^ (floor(iter / step)) + // - exp: return base_lr * gamma ^ iter + // - inv: return base_lr * (1 + gamma * iter) ^ (- power) + // - multistep: similar to step but it allows non uniform steps defined by + // stepvalue + // - poly: the effective learning rate follows a polynomial decay, to be + // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) + // - sigmoid: the effective learning rate follows a sigmod decay + // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // + // where base_lr, max_iter, gamma, step, stepvalue and power are defined + // in the solver parameter protocol buffer, and iter is the current iteration. + optional string lr_policy = 8; + optional float gamma = 9; // The parameter to compute the learning rate. + optional float power = 10; // The parameter to compute the learning rate. + optional float momentum = 11; // The momentum value. + optional float weight_decay = 12; // The weight decay. 
+ // regularization types supported: L1 and L2 + // controlled by weight_decay + optional string regularization_type = 29 [default = "L2"]; + // the stepsize for learning rate policy "step" + optional int32 stepsize = 13; + // the stepsize for learning rate policy "multistep" + repeated int32 stepvalue = 34; + + // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, + // whenever their actual L2 norm is larger. + optional float clip_gradients = 35 [default = -1]; + + optional int32 snapshot = 14 [default = 0]; // The snapshot interval + optional string snapshot_prefix = 15; // The prefix for the snapshot. + // whether to snapshot diff in the results or not. Snapshotting diff will help + // debugging but the final protocol buffer size will be much larger. + optional bool snapshot_diff = 16 [default = false]; + enum SnapshotFormat { + HDF5 = 0; + BINARYPROTO = 1; + } + optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO]; + // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default. + enum SolverMode { + CPU = 0; + GPU = 1; + } + optional SolverMode solver_mode = 17 [default = GPU]; + // the device_id will that be used in GPU mode. Use device_id = 0 in default. + optional int32 device_id = 18 [default = 0]; + // If non-negative, the seed with which the Solver will initialize the Caffe + // random number generator -- useful for reproducible results. Otherwise, + // (and by default) initialize using a seed derived from the system clock. + optional int64 random_seed = 20 [default = -1]; + + // type of the solver + optional string type = 40 [default = "SGD"]; + + // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam + optional float delta = 31 [default = 1e-8]; + // parameters for the Adam solver + optional float momentum2 = 39 [default = 0.999]; + + // RMSProp decay value + // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) + optional float rms_decay = 38; + + // If true, print information about the state of the net that may help with + // debugging learning problems. + optional bool debug_info = 23 [default = false]; + + // If false, don't save a snapshot after training finishes. + optional bool snapshot_after_train = 28 [default = true]; + + // DEPRECATED: old solver enum types, use string instead + enum SolverType { + SGD = 0; + NESTEROV = 1; + ADAGRAD = 2; + RMSPROP = 3; + ADADELTA = 4; + ADAM = 5; + } + // DEPRECATED: use type instead of solver_type + optional SolverType solver_type = 30 [default = SGD]; +} + +// A message that stores the solver snapshots +message SolverState { + optional int32 iter = 1; // The current iteration + optional string learned_net = 2; // The file that stores the learned net. + repeated BlobProto history = 3; // The history for sgd solvers + optional int32 current_step = 4 [default = 0]; // The current step for learning rate +} + +enum Phase { + TRAIN = 0; + TEST = 1; +} + +message NetState { + optional Phase phase = 1 [default = TEST]; + optional int32 level = 2 [default = 0]; + repeated string stage = 3; +} + +message NetStateRule { + // Set phase to require the NetState have a particular phase (TRAIN or TEST) + // to meet this rule. + optional Phase phase = 1; + + // Set the minimum and/or maximum levels in which the layer should be used. + // Leave undefined to meet the rule regardless of level. + optional int32 min_level = 2; + optional int32 max_level = 3; + + // Customizable sets of stages to include or exclude. 
+ // The net must have ALL of the specified stages and NONE of the specified + // "not_stage"s to meet the rule. + // (Use multiple NetStateRules to specify conjunctions of stages.) + repeated string stage = 4; + repeated string not_stage = 5; +} + +// Specifies training parameters (multipliers on global learning constants, +// and the name and other settings used for weight sharing). +message ParamSpec { + // The names of the parameter blobs -- useful for sharing parameters among + // layers, but never required otherwise. To share a parameter between two + // layers, give it a (non-empty) name. + optional string name = 1; + + // Whether to require shared weights to have the same shape, or just the same + // count -- defaults to STRICT if unspecified. + optional DimCheckMode share_mode = 2; + enum DimCheckMode { + // STRICT (default) requires that num, channels, height, width each match. + STRICT = 0; + // PERMISSIVE requires only the count (num*channels*height*width) to match. + PERMISSIVE = 1; + } + + // The multiplier on the global learning rate for this parameter. + optional float lr_mult = 3 [default = 1.0]; + + // The multiplier on the global weight decay for this parameter. + optional float decay_mult = 4 [default = 1.0]; +} + +// NOTE +// Update the next available ID when you add a new LayerParameter field. +// +// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) +message LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the layer type + repeated string bottom = 3; // the name of each bottom blob + repeated string top = 4; // the name of each top blob + + // The train / test phase for computation. + optional Phase phase = 10; + + // The amount of weight to assign each top blob in the objective. + // Each layer assigns a default value, usually of either 0 or 1, + // to each top blob. + repeated float loss_weight = 5; + + // Specifies training parameters (multipliers on global learning constants, + // and the name and other settings used for weight sharing). + repeated ParamSpec param = 6; + + // The blobs containing the numeric parameters of the layer. + repeated BlobProto blobs = 7; + + // Specifies whether to backpropagate to each bottom. If unspecified, + // Caffe will automatically infer whether each input needs backpropagation + // to compute parameter gradients. If set to true for some inputs, + // backpropagation to those inputs is forced; if set false for some inputs, + // backpropagation to those inputs is skipped. + // + // The size must be either 0 or equal to the number of bottoms. + repeated bool propagate_down = 11; + + // Rules controlling whether and when a layer is included in the network, + // based on the current NetState. You may specify a non-zero number of rules + // to include OR exclude, but not both. If no include or exclude rules are + // specified, the layer is always included. If the current NetState meets + // ANY (i.e., one or more) of the specified rules, the layer is + // included/excluded. + repeated NetStateRule include = 8; + repeated NetStateRule exclude = 9; + + // Parameters for data pre-processing. + optional TransformationParameter transform_param = 100; + + // Parameters shared by loss layers. + optional LossParameter loss_param = 101; + + // Layer type-specific parameters. + // + // Note: certain layers may have more than one computational engine + // for their implementation. 
These layers include an Engine type and + // engine parameter for selecting the implementation. + // The default for the engine is set by the ENGINE switch at compile-time. + optional AccuracyParameter accuracy_param = 102; + optional ArgMaxParameter argmax_param = 103; + optional BatchNormParameter batch_norm_param = 139; + optional BiasParameter bias_param = 141; + optional ConcatParameter concat_param = 104; + optional ContrastiveLossParameter contrastive_loss_param = 105; + optional ConvolutionParameter convolution_param = 106; + optional CropParameter crop_param = 144; + optional DataParameter data_param = 107; + optional DropoutParameter dropout_param = 108; + optional DummyDataParameter dummy_data_param = 109; + optional EltwiseParameter eltwise_param = 110; + optional ELUParameter elu_param = 140; + optional EmbedParameter embed_param = 137; + optional ExpParameter exp_param = 111; + optional FlattenParameter flatten_param = 135; + optional HDF5DataParameter hdf5_data_param = 112; + optional HDF5OutputParameter hdf5_output_param = 113; + optional HingeLossParameter hinge_loss_param = 114; + optional ImageDataParameter image_data_param = 115; + optional InfogainLossParameter infogain_loss_param = 116; + optional InnerProductParameter inner_product_param = 117; + optional InputParameter input_param = 143; + optional LogParameter log_param = 134; + optional LRNParameter lrn_param = 118; + optional MemoryDataParameter memory_data_param = 119; + optional MVNParameter mvn_param = 120; + optional ParameterParameter parameter_param = 145; + optional PoolingParameter pooling_param = 121; + optional PowerParameter power_param = 122; + optional PReLUParameter prelu_param = 131; + optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 146; + optional ReductionParameter reduction_param = 136; + optional ReLUParameter relu_param = 123; + optional ReshapeParameter reshape_param = 133; + optional ScaleParameter scale_param = 142; + optional SigmoidParameter sigmoid_param = 124; + optional SoftmaxParameter softmax_param = 125; + optional SPPParameter spp_param = 132; + optional SliceParameter slice_param = 126; + optional TanHParameter tanh_param = 127; + optional ThresholdParameter threshold_param = 128; + optional TileParameter tile_param = 138; + optional WindowDataParameter window_data_param = 129; +} + +// Message that stores parameters used to apply transformation +// to the data layer's data +message TransformationParameter { + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 1 [default = 1]; + // Specify if we want to randomly mirror data. + optional bool mirror = 2 [default = false]; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 3 [default = 0]; + // mean_file and mean_value cannot be specified at the same time + optional string mean_file = 4; + // if specified can be repeated once (would substract it from all the channels) + // or can be repeated the same number of times as channels + // (would subtract them from the corresponding channel) + repeated float mean_value = 5; + // Force the decoded image to have 3 color channels. + optional bool force_color = 6 [default = false]; + // Force the decoded image to have 1 color channels. 
+ optional bool force_gray = 7 [default = false]; +} + +// Message that stores parameters shared by loss layers +message LossParameter { + // If specified, ignore instances with the given label. + optional int32 ignore_label = 1; + // How to normalize the loss for loss layers that aggregate across batches, + // spatial dimensions, or other dimensions. Currently only implemented in + // SoftmaxWithLoss layer. + enum NormalizationMode { + // Divide by the number of examples in the batch times spatial dimensions. + // Outputs that receive the ignore label will NOT be ignored in computing + // the normalization factor. + FULL = 0; + // Divide by the total number of output locations that do not take the + // ignore_label. If ignore_label is not set, this behaves like FULL. + VALID = 1; + // Divide by the batch size. + BATCH_SIZE = 2; + // Do not normalize the loss. + NONE = 3; + } + optional NormalizationMode normalization = 3 [default = VALID]; + // Deprecated. Ignored if normalization is specified. If normalization + // is not specified, then setting this to false will be equivalent to + // normalization = BATCH_SIZE to be consistent with previous behavior. + optional bool normalize = 2; +} + +// Messages that store parameters used by individual layer types follow, in +// alphabetical order. + +message AccuracyParameter { + // When computing accuracy, count as correct by comparing the true label to + // the top k scoring classes. By default, only compare to the top scoring + // class (i.e. argmax). + optional uint32 top_k = 1 [default = 1]; + + // The "label" axis of the prediction blob, whose argmax corresponds to the + // predicted label -- may be negative to index from the end (e.g., -1 for the + // last axis). For example, if axis == 1 and the predictions are + // (N x C x H x W), the label blob is expected to contain N*H*W ground truth + // labels with integer values in {0, 1, ..., C-1}. + optional int32 axis = 2 [default = 1]; + + // If specified, ignore instances with the given label. + optional int32 ignore_label = 3; +} + +message ArgMaxParameter { + // If true produce pairs (argmax, maxval) + optional bool out_max_val = 1 [default = false]; + optional uint32 top_k = 2 [default = 1]; + // The axis along which to maximise -- may be negative to index from the + // end (e.g., -1 for the last axis). + // By default ArgMaxLayer maximizes over the flattened trailing dimensions + // for each index of the first / num dimension. + optional int32 axis = 3; +} + +message ConcatParameter { + // The axis along which to concatenate -- may be negative to index from the + // end (e.g., -1 for the last axis). Other axes must have the + // same dimension for all the bottom blobs. + // By default, ConcatLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 2 [default = 1]; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 concat_dim = 1 [default = 1]; +} + +message BatchNormParameter { + // If false, accumulate global mean/variance values via a moving average. If + // true, use those accumulated values instead of computing mean/variance + // across the batch. + optional bool use_global_stats = 1; + // How much does the moving average decay each iteration? + optional float moving_average_fraction = 2 [default = .999]; + // Small value to add to the variance estimate so that we don't divide by + // zero. 
+ optional float eps = 3 [default = 1e-5]; +} + +message BiasParameter { + // The first axis of bottom[0] (the first input Blob) along which to apply + // bottom[1] (the second input Blob). May be negative to index from the end + // (e.g., -1 for the last axis). + // + // For example, if bottom[0] is 4D with shape 100x3x40x60, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 + // (axis == 1 == -3) 3; 3x40; 3x40x60 + // (axis == 2 == -2) 40; 40x60 + // (axis == 3 == -1) 60 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a scalar bias. + optional int32 axis = 1 [default = 1]; + + // (num_axes is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer. Otherwise, num_axes is determined by the + // number of axes by the second bottom.) + // The number of axes of the input (bottom[0]) covered by the bias + // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. + // Set num_axes := 0, to add a zero-axis Blob: a scalar. + optional int32 num_axes = 2 [default = 1]; + + // (filler is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer.) + // The initialization for the learned bias parameter. + // Default is the zero (0) initialization, resulting in the BiasLayer + // initially performing the identity operation. + optional FillerParameter filler = 3; +} + +message ContrastiveLossParameter { + // margin for dissimilar pair + optional float margin = 1 [default = 1.0]; + // The first implementation of this cost did not exactly match the cost of + // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. + // legacy_version = false (the default) uses (margin - d)^2 as proposed in the + // Hadsell paper. New models should probably use this version. + // legacy_version = true uses (margin - d^2). This is kept to support / + // reproduce existing models and results + optional bool legacy_version = 2 [default = false]; +} + +message ConvolutionParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in all spatial dimensions, or once per spatial dimension. + repeated uint32 pad = 3; // The padding size; defaults to 0 + repeated uint32 kernel_size = 4; // The kernel size + repeated uint32 stride = 6; // The stride; defaults to 1 + // Factor used to dilate the kernel, (implicitly) zero-filling the resulting + // holes. (Kernel dilation is sometimes referred to by its use in the + // algorithme à trous from Holschneider et al. 1987.) + repeated uint32 dilation = 18; // The dilation; defaults to 1 + + // For 2D convolution only, the *_h and *_w versions may also be used to + // specify both spatial dimensions. 
+ optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) + optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) + optional uint32 kernel_h = 11; // The kernel height (2D only) + optional uint32 kernel_w = 12; // The kernel width (2D only) + optional uint32 stride_h = 13; // The stride height (2D only) + optional uint32 stride_w = 14; // The stride width (2D only) + + optional uint32 group = 5 [default = 1]; // The group size for group conv + + optional FillerParameter weight_filler = 7; // The filler for the weight + optional FillerParameter bias_filler = 8; // The filler for the bias + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 15 [default = DEFAULT]; + + // The axis to interpret as "channels" when performing convolution. + // Preceding dimensions are treated as independent inputs; + // succeeding dimensions are treated as "spatial". + // With (N, C, H, W) inputs, and axis == 1 (the default), we perform + // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for + // groups g>1) filters across the spatial axes (H, W) of the input. + // With (N, C, D, H, W) inputs, and axis == 1, we perform + // N independent 3D convolutions, sliding (C/g)-channels + // filters across the spatial axes (D, H, W) of the input. + optional int32 axis = 16 [default = 1]; + + // Whether to force use of the general ND convolution, even if a specific + // implementation for blobs of the appropriate number of spatial dimensions + // is available. (Currently, there is only a 2D-specific convolution + // implementation; for input blobs with num_axes != 2, this option is + // ignored and the ND implementation will be used.) + optional bool force_nd_im2col = 17 [default = false]; +} + +message CropParameter { + // To crop, elements of the first bottom are selected to fit the dimensions + // of the second, reference bottom. The crop is configured by + // - the crop `axis` to pick the dimensions for cropping + // - the crop `offset` to set the shift for all/each dimension + // to align the cropped bottom with the reference bottom. + // All dimensions up to but excluding `axis` are preserved, while + // the dimensions including and trailing `axis` are cropped. + // If only one `offset` is set, then all dimensions are offset by this amount. + // Otherwise, the number of offsets must equal the number of cropped axes to + // shift the crop in each dimension accordingly. + // Note: standard dimensions are N,C,H,W so the default is a spatial crop, + // and `axis` may be negative to index from the end (e.g., -1 for the last + // axis). + optional int32 axis = 1 [default = 2]; + repeated uint32 offset = 2; +} + +message DataParameter { + enum DB { + LEVELDB = 0; + LMDB = 1; + } + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + // DEPRECATED. Each solver accesses a different subset of the database. + optional uint32 rand_skip = 7 [default = 0]; + optional DB backend = 8 [default = LEVELDB]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. 
Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + // Force the encoded image to have 3 color channels + optional bool force_encoded_color = 9 [default = false]; + // Prefetch queue (Number of batches to prefetch to host memory, increase if + // data access bandwidth varies). + optional uint32 prefetch = 10 [default = 4]; +} + +message DropoutParameter { + optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio +} + +// DummyDataLayer fills any number of arbitrarily shaped blobs with random +// (or constant) data generated by "Fillers" (see "message FillerParameter"). +message DummyDataParameter { + // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N + // shape fields, and 0, 1 or N data_fillers. + // + // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. + // If 1 data_filler is specified, it is applied to all top blobs. If N are + // specified, the ith is applied to the ith top blob. + repeated FillerParameter data_filler = 1; + repeated BlobShape shape = 6; + + // 4D dimensions -- deprecated. Use "shape" instead. + repeated uint32 num = 2; + repeated uint32 channels = 3; + repeated uint32 height = 4; + repeated uint32 width = 5; +} + +message EltwiseParameter { + enum EltwiseOp { + PROD = 0; + SUM = 1; + MAX = 2; + } + optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation + repeated float coeff = 2; // blob-wise coefficient for SUM operation + + // Whether to use an asymptotically slower (for >2 inputs) but stabler method + // of computing the gradient for the PROD operation. (No effect for SUM op.) + optional bool stable_prod_grad = 3 [default = true]; +} + +// Message that stores parameters used by ELULayer +message ELUParameter { + // Described in: + // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate + // Deep Network Learning by Exponential Linear Units (ELUs). arXiv + optional float alpha = 1 [default = 1]; +} + +// Message that stores parameters used by EmbedLayer +message EmbedParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + // The input is given as integers to be interpreted as one-hot + // vector indices with dimension num_input. Hence num_input should be + // 1 greater than the maximum possible input value. + optional uint32 input_dim = 2; + + optional bool bias_term = 3 [default = true]; // Whether to use a bias term + optional FillerParameter weight_filler = 4; // The filler for the weight + optional FillerParameter bias_filler = 5; // The filler for the bias + +} + +// Message that stores parameters used by ExpLayer +message ExpParameter { + // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = exp(shift + scale * x). + optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +/// Message that stores parameters used by FlattenLayer +message FlattenParameter { + // The first axis to flatten: all preceding axes are retained in the output. 
+ // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 1 [default = 1]; + + // The last axis to flatten: all following axes are retained in the output. + // May be negative to index from the end (e.g., the default -1 for the last + // axis). + optional int32 end_axis = 2 [default = -1]; +} + +// Message that stores parameters used by HDF5DataLayer +message HDF5DataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 2; + + // Specify whether to shuffle the data. + // If shuffle == true, the ordering of the HDF5 files is shuffled, + // and the ordering of data within any given HDF5 file is shuffled, + // but data between different files are not interleaved; all of a file's + // data are output (in a random order) before moving onto another file. + optional bool shuffle = 3 [default = false]; +} + +message HDF5OutputParameter { + optional string file_name = 1; +} + +message HingeLossParameter { + enum Norm { + L1 = 1; + L2 = 2; + } + // Specify the Norm to use L1 or L2 + optional Norm norm = 1 [default = L1]; +} + +message ImageDataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4 [default = 1]; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + optional uint32 rand_skip = 7 [default = 0]; + // Whether or not ImageLayer should shuffle the list of files at every epoch. + optional bool shuffle = 8 [default = false]; + // It will also resize images if new_height or new_width are not zero. + optional uint32 new_height = 9 [default = 0]; + optional uint32 new_width = 10 [default = 0]; + // Specify if the images are color or gray + optional bool is_color = 11 [default = true]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + optional string root_folder = 12 [default = ""]; +} + +message InfogainLossParameter { + // Specify the infogain matrix source. + optional string source = 1; +} + +message InnerProductParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + optional FillerParameter weight_filler = 3; // The filler for the weight + optional FillerParameter bias_filler = 4; // The filler for the bias + + // The first axis to be lumped into a single inner product computation; + // all preceding axes are retained in the output. + // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 5 [default = 1]; + // Specify whether to transpose the weight matrix or not. + // If transpose == true, any operations will be performed on the transpose + // of the weight matrix. 
The weight matrix itself is not going to be transposed + // but rather the transfer flag of operations will be toggled accordingly. + optional bool transpose = 6 [default = false]; +} + +message InputParameter { + // This layer produces N >= 1 top blob(s) to be assigned manually. + // Define N shapes to set a shape for each top. + // Define 1 shape to set the same shape for every top. + // Define no shape to defer to reshaping manually. + repeated BlobShape shape = 1; +} + +// Message that stores parameters used by LogLayer +message LogParameter { + // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = ln(shift + scale * x) = log_e(shift + scale * x) + optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +// Message that stores parameters used by LRNLayer +message LRNParameter { + optional uint32 local_size = 1 [default = 5]; + optional float alpha = 2 [default = 1.]; + optional float beta = 3 [default = 0.75]; + enum NormRegion { + ACROSS_CHANNELS = 0; + WITHIN_CHANNEL = 1; + } + optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; + optional float k = 5 [default = 1.]; + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +message MemoryDataParameter { + optional uint32 batch_size = 1; + optional uint32 channels = 2; + optional uint32 height = 3; + optional uint32 width = 4; +} + +message MVNParameter { + // This parameter can be set to false to normalize mean only + optional bool normalize_variance = 1 [default = true]; + + // This parameter can be set to true to perform DNN-like MVN + optional bool across_channels = 2 [default = false]; + + // Epsilon for not dividing by zero while normalizing variance + optional float eps = 3 [default = 1e-9]; +} + +message ParameterParameter { + optional BlobShape shape = 1; +} + +message PoolingParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional PoolMethod pool = 1 [default = MAX]; // The pooling method + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in height and width or as Y, X pairs. + optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X) + optional uint32 pad_h = 9 [default = 0]; // The padding height + optional uint32 pad_w = 10 [default = 0]; // The padding width + optional uint32 kernel_size = 2; // The kernel size (square) + optional uint32 kernel_h = 5; // The kernel height + optional uint32 kernel_w = 6; // The kernel width + optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X) + optional uint32 stride_h = 7; // The stride height + optional uint32 stride_w = 8; // The stride width + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 11 [default = DEFAULT]; + // If global_pooling then it will pool over the size of the bottom by doing + // kernel_h = bottom->height and kernel_w = bottom->width + optional bool global_pooling = 12 [default = false]; +} + +message PowerParameter { + // PowerLayer computes outputs y = (shift + scale * x) ^ power. 
+ optional float power = 1 [default = 1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +message PythonParameter { + optional string module = 1; + optional string layer = 2; + // This value is set to the attribute `param_str` of the `PythonLayer` object + // in Python before calling the `setup()` method. This could be a number, + // string, dictionary in Python dict format, JSON, etc. You may parse this + // string in `setup` method and use it in `forward` and `backward`. + optional string param_str = 3 [default = '']; + // Whether this PythonLayer is shared among worker solvers during data parallelism. + // If true, each worker solver sequentially run forward from this layer. + // This value should be set true if you are using it as a data layer. + optional bool share_in_parallel = 4 [default = false]; +} + +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. + optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; + + // Whether to add as additional inputs (bottoms) the initial hidden state + // blobs, and add as additional outputs (tops) the final timestep hidden state + // blobs. The number of additional bottom/top blobs required depends on the + // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. + optional bool expose_hidden = 5 [default = false]; +} + +// Message that stores parameters used by ReductionLayer +message ReductionParameter { + enum ReductionOp { + SUM = 1; + ASUM = 2; + SUMSQ = 3; + MEAN = 4; + } + + optional ReductionOp operation = 1 [default = SUM]; // reduction operation + + // The first axis to reduce to a scalar -- may be negative to index from the + // end (e.g., -1 for the last axis). + // (Currently, only reduction along ALL "tail" axes is supported; reduction + // of axis M through N, where N < num_axes - 1, is unsupported.) + // Suppose we have an n-axis bottom Blob with shape: + // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)). + // If axis == m, the output Blob will have shape + // (d0, d1, d2, ..., d(m-1)), + // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1)) + // times, each including (dm * d(m+1) * ... * d(n-1)) individual data. + // If axis == 0 (the default), the output Blob always has the empty shape + // (count 1), performing reduction across the entire input -- + // often useful for creating new loss functions. + optional int32 axis = 2 [default = 0]; + + optional float coeff = 3 [default = 1.0]; // coefficient for output +} + +// Message that stores parameters used by ReLULayer +message ReLUParameter { + // Allow non-zero slope for negative inputs to speed up optimization + // Described in: + // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities + // improve neural network acoustic models. In ICML Workshop on Deep Learning + // for Audio, Speech, and Language Processing. + optional float negative_slope = 1 [default = 0]; + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 2 [default = DEFAULT]; +} + +message ReshapeParameter { + // Specify the output dimensions. 
If some of the dimensions are set to 0, + // the corresponding dimension from the bottom layer is used (unchanged). + // Exactly one dimension may be set to -1, in which case its value is + // inferred from the count of the bottom blob and the remaining dimensions. + // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: + // + // layer { + // type: "Reshape" bottom: "input" top: "output" + // reshape_param { ... } + // } + // + // If "input" is 2D with shape 2 x 8, then the following reshape_param + // specifications are all equivalent, producing a 3D blob "output" with shape + // 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } + // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } + // + optional BlobShape shape = 1; + + // axis and num_axes control the portion of the bottom blob's shape that are + // replaced by (included in) the reshape. By default (axis == 0 and + // num_axes == -1), the entire bottom blob shape is included in the reshape, + // and hence the shape field must specify the entire output shape. + // + // axis may be non-zero to retain some portion of the beginning of the input + // shape (and may be negative to index from the end; e.g., -1 to begin the + // reshape after the last axis, including nothing in the reshape, + // -2 to include only the last axis, etc.). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. + // Then the following ReshapeLayer specifications are all equivalent, + // producing a blob "output" with shape 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } + // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } + // + // num_axes specifies the extent of the reshape. + // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on + // input axes in the range [axis, axis+num_axes]. + // num_axes may also be -1, the default, to include all remaining axes + // (starting from axis). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. + // Then the following ReshapeLayer specifications are equivalent, + // producing a blob "output" with shape 1 x 2 x 8. + // + // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } + // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } + // reshape_param { shape { dim: 1 } num_axes: 0 } + // + // On the other hand, these would produce output blob shape 2 x 1 x 8: + // + // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } + // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } + // + optional int32 axis = 2 [default = 0]; + optional int32 num_axes = 3 [default = -1]; +} + +message ScaleParameter { + // The first axis of bottom[0] (the first input Blob) along which to apply + // bottom[1] (the second input Blob). May be negative to index from the end + // (e.g., -1 for the last axis). + // + // For example, if bottom[0] is 4D with shape 100x3x40x60, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 + // (axis == 1 == -3) 3; 3x40; 3x40x60 + // (axis == 2 == -2) 40; 40x60 + // (axis == 3 == -1) 60 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a scalar multiplier. 
+ optional int32 axis = 1 [default = 1]; + + // (num_axes is ignored unless just one bottom is given and the scale is + // a learned parameter of the layer. Otherwise, num_axes is determined by the + // number of axes by the second bottom.) + // The number of axes of the input (bottom[0]) covered by the scale + // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. + // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. + optional int32 num_axes = 2 [default = 1]; + + // (filler is ignored unless just one bottom is given and the scale is + // a learned parameter of the layer.) + // The initialization for the learned scale parameter. + // Default is the unit (1) initialization, resulting in the ScaleLayer + // initially performing the identity operation. + optional FillerParameter filler = 3; + + // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but + // may be more efficient). Initialized with bias_filler (defaults to 0). + optional bool bias_term = 4 [default = false]; + optional FillerParameter bias_filler = 5; +} + +message SigmoidParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; +} + +message SliceParameter { + // The axis along which to slice -- may be negative to index from the end + // (e.g., -1 for the last axis). + // By default, SliceLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 3 [default = 1]; + repeated uint32 slice_point = 2; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 slice_dim = 1 [default = 1]; +} + +// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer +message SoftmaxParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; + + // The axis along which to perform the softmax -- may be negative to index + // from the end (e.g., -1 for the last axis). + // Any other axes will be evaluated as independent softmaxes. + optional int32 axis = 2 [default = 1]; +} + +message TanHParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; +} + +// Message that stores parameters used by TileLayer +message TileParameter { + // The index of the axis to tile. + optional int32 axis = 1 [default = 1]; + + // The number of copies (tiles) of the blob to output. + optional int32 tiles = 2; +} + +// Message that stores parameters used by ThresholdLayer +message ThresholdParameter { + optional float threshold = 1 [default = 0]; // Strictly positive values +} + +message WindowDataParameter { + // Specify the data source. + optional string source = 1; + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // Specify the batch size. + optional uint32 batch_size = 4; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 5 [default = 0]; + // Specify if we want to randomly mirror data. 
+ optional bool mirror = 6 [default = false]; + // Foreground (object) overlap threshold + optional float fg_threshold = 7 [default = 0.5]; + // Background (non-object) overlap threshold + optional float bg_threshold = 8 [default = 0.5]; + // Fraction of batch that should be foreground objects + optional float fg_fraction = 9 [default = 0.25]; + // Amount of contextual padding to add around a window + // (used only by the window_data_layer) + optional uint32 context_pad = 10 [default = 0]; + // Mode for cropping out a detection window + // warp: cropped window is warped to a fixed size and aspect ratio + // square: the tightest square around the window is cropped + optional string crop_mode = 11 [default = "warp"]; + // cache_images: will load all images in memory for faster access + optional bool cache_images = 12 [default = false]; + // append root_folder to locate images + optional string root_folder = 13 [default = ""]; +} + +message SPPParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional uint32 pyramid_height = 1; + optional PoolMethod pool = 2 [default = MAX]; // The pooling method + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +// DEPRECATED: use LayerParameter. +message V1LayerParameter { + repeated string bottom = 2; + repeated string top = 3; + optional string name = 4; + repeated NetStateRule include = 32; + repeated NetStateRule exclude = 33; + enum LayerType { + NONE = 0; + ABSVAL = 35; + ACCURACY = 1; + ARGMAX = 30; + BNLL = 2; + CONCAT = 3; + CONTRASTIVE_LOSS = 37; + CONVOLUTION = 4; + DATA = 5; + DECONVOLUTION = 39; + DROPOUT = 6; + DUMMY_DATA = 32; + EUCLIDEAN_LOSS = 7; + ELTWISE = 25; + EXP = 38; + FLATTEN = 8; + HDF5_DATA = 9; + HDF5_OUTPUT = 10; + HINGE_LOSS = 28; + IM2COL = 11; + IMAGE_DATA = 12; + INFOGAIN_LOSS = 13; + INNER_PRODUCT = 14; + LRN = 15; + MEMORY_DATA = 29; + MULTINOMIAL_LOGISTIC_LOSS = 16; + MVN = 34; + POOLING = 17; + POWER = 26; + RELU = 18; + SIGMOID = 19; + SIGMOID_CROSS_ENTROPY_LOSS = 27; + SILENCE = 36; + SOFTMAX = 20; + SOFTMAX_LOSS = 21; + SPLIT = 22; + SLICE = 33; + TANH = 23; + WINDOW_DATA = 24; + THRESHOLD = 31; + } + optional LayerType type = 5; + repeated BlobProto blobs = 6; + repeated string param = 1001; + repeated DimCheckMode blob_share_mode = 1002; + enum DimCheckMode { + STRICT = 0; + PERMISSIVE = 1; + } + repeated float blobs_lr = 7; + repeated float weight_decay = 8; + repeated float loss_weight = 35; + optional AccuracyParameter accuracy_param = 27; + optional ArgMaxParameter argmax_param = 23; + optional ConcatParameter concat_param = 9; + optional ContrastiveLossParameter contrastive_loss_param = 40; + optional ConvolutionParameter convolution_param = 10; + optional DataParameter data_param = 11; + optional DropoutParameter dropout_param = 12; + optional DummyDataParameter dummy_data_param = 26; + optional EltwiseParameter eltwise_param = 24; + optional ExpParameter exp_param = 41; + optional HDF5DataParameter hdf5_data_param = 13; + optional HDF5OutputParameter hdf5_output_param = 14; + optional HingeLossParameter hinge_loss_param = 29; + optional ImageDataParameter image_data_param = 15; + optional InfogainLossParameter infogain_loss_param = 16; + optional InnerProductParameter inner_product_param = 17; + optional LRNParameter lrn_param = 18; + optional MemoryDataParameter memory_data_param = 22; + optional MVNParameter mvn_param = 34; + optional PoolingParameter pooling_param = 19; + optional PowerParameter power_param = 21; 
+ optional ReLUParameter relu_param = 30; + optional SigmoidParameter sigmoid_param = 38; + optional SoftmaxParameter softmax_param = 39; + optional SliceParameter slice_param = 31; + optional TanHParameter tanh_param = 37; + optional ThresholdParameter threshold_param = 25; + optional WindowDataParameter window_data_param = 20; + optional TransformationParameter transform_param = 36; + optional LossParameter loss_param = 42; + optional V0LayerParameter layer = 1; +} + +// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters +// in Caffe. We keep this message type around for legacy support. +message V0LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the string to specify the layer type + + // Parameters to specify layers with inner products. + optional uint32 num_output = 3; // The number of outputs for the layer + optional bool biasterm = 4 [default = true]; // whether to have bias terms + optional FillerParameter weight_filler = 5; // The filler for the weight + optional FillerParameter bias_filler = 6; // The filler for the bias + + optional uint32 pad = 7 [default = 0]; // The padding size + optional uint32 kernelsize = 8; // The kernel size + optional uint32 group = 9 [default = 1]; // The group size for group conv + optional uint32 stride = 10 [default = 1]; // The stride + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional PoolMethod pool = 11 [default = MAX]; // The pooling method + optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio + + optional uint32 local_size = 13 [default = 5]; // for local response norm + optional float alpha = 14 [default = 1.]; // for local response norm + optional float beta = 15 [default = 0.75]; // for local response norm + optional float k = 22 [default = 1.]; + + // For data layers, specify the data source + optional string source = 16; + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 17 [default = 1]; + optional string meanfile = 18; + // For data layers, specify the batch size. + optional uint32 batchsize = 19; + // For data layers, specify if we would like to randomly crop an image. + optional uint32 cropsize = 20 [default = 0]; + // For data layers, specify if we want to randomly mirror data. + optional bool mirror = 21 [default = false]; + + // The blobs containing the numeric parameters of the layer + repeated BlobProto blobs = 50; + // The ratio that is multiplied on the global learning rate. If you want to + // set the learning ratio for one blob, you need to set it for all blobs. + repeated float blobs_lr = 51; + // The weight decay that is multiplied on the global weight decay. + repeated float weight_decay = 52; + + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. 
+  optional uint32 rand_skip = 53 [default = 0];
+
+  // Fields related to detection (det_*)
+  // foreground (object) overlap threshold
+  optional float det_fg_threshold = 54 [default = 0.5];
+  // background (non-object) overlap threshold
+  optional float det_bg_threshold = 55 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float det_fg_fraction = 56 [default = 0.25];
+
+  // optional bool OBSOLETE_can_clobber = 57 [default = true];
+
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 det_context_pad = 58 [default = 0];
+
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string det_crop_mode = 59 [default = "warp"];
+
+  // For ReshapeLayer, one needs to specify the new dimensions.
+  optional int32 new_num = 60 [default = 0];
+  optional int32 new_channels = 61 [default = 0];
+  optional int32 new_height = 62 [default = 0];
+  optional int32 new_width = 63 [default = 0];
+
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  // It will also resize images if new_height or new_width are not zero.
+  optional bool shuffle_images = 64 [default = false];
+
+  // For ConcatLayer, one needs to specify the dimension for concatenation, and
+  // the other dimensions must be the same for all the bottom blobs.
+  // By default it will concatenate blobs along the channels dimension.
+  optional uint32 concat_dim = 65 [default = 1];
+
+  optional HDF5OutputParameter hdf5_output_param = 1001;
+}
+
+message PReLUParameter {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerParameter filler = 1;
+  // Whether or not slope parameters are shared across channels.
+ optional bool channel_shared = 2 [default = false]; +} diff --git a/caffe2/.clang-format b/caffe2/.clang-format new file mode 100644 index 0000000..1307bf2 --- /dev/null +++ b/caffe2/.clang-format @@ -0,0 +1,87 @@ +--- +AccessModifierOffset: -1 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] +IncludeCategories: + - Regex: '^<.*\.h(pp)?>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... 
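As a quick illustration of how the layer parameter messages added in `caffe.proto` above are consumed, the sketch below builds a `ScaleParameter` and round-trips one of the `ReshapeParameter` examples from the comments through the prototxt text format. This is a minimal sketch, assuming the module protoc generates from this `caffe.proto` is importable as `caffe.proto.caffe_pb2`; everything else uses only fields defined in the hunk above.

```
from google.protobuf import text_format

# Assumption: the protoc-generated module from caffe.proto is importable
# under this name; adjust to wherever the build actually places it.
from caffe.proto import caffe_pb2

# ScaleParameter, as defined above: a learned scale applied along the
# channel axis (axis = 1), covering a single axis, with a learned bias.
scale = caffe_pb2.ScaleParameter()
scale.axis = 1
scale.num_axes = 1
scale.bias_term = True
print(text_format.MessageToString(scale))

# Round-trip one of the ReshapeParameter examples from the comments above:
#   reshape_param { shape { dim: 0 dim: 2 dim: -1 } }
reshape = text_format.Parse("shape { dim: 0 dim: 2 dim: -1 }",
                            caffe_pb2.ReshapeParameter())
assert list(reshape.shape.dim) == [0, 2, -1]
```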
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt new file mode 100644 index 0000000..07f0164 --- /dev/null +++ b/caffe2/CMakeLists.txt @@ -0,0 +1,553 @@ +# ---[ Generate and install header and cpp files +include(../cmake/Codegen.cmake) + +# ---[ Declare source file lists + +# ---[ Shared build +add_subdirectory(utils) + +# ---[ ATen build +if(BUILD_ATEN) + set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(AT_LINK_STYLE INTERFACE) + add_subdirectory(../aten aten) + set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) + + if(BUILD_CAFFE2) + # Generate the headers wrapped by our operator + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h + COMMAND + ${PYCMD} ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py + --aten_root=${CMAKE_CURRENT_SOURCE_DIR}/../aten + --template_dir=${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten + --yaml_dir=${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + --install_dir=${CMAKE_CURRENT_BINARY_DIR}/contrib/aten + DEPENDS + ATEN_CPU_FILES_GEN_TARGET + ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py + ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/aten_op_template.h) + + add_custom_target(__aten_op_header_gen + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h) + add_library(aten_op_header_gen INTERFACE) + add_dependencies(aten_op_header_gen __aten_op_header_gen) + endif() + + # Add source, includes, and libs to lists + list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) + list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) + # ATen tests use catch instead of gtest so keep separate for now + # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) + list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) + list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS}) + list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) + + IF(USE_ROCM) + # Set the HIP Variables + set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) + ENDIF(USE_ROCM) +endif() + +# ---[ Caffe2 build +if(BUILD_CAFFE2) + # Note: the folders that are being commented out have not been properly + # addressed yet. + add_subdirectory(proto) + add_subdirectory(contrib) + add_subdirectory(core) + add_subdirectory(core/nomnigraph) + add_subdirectory(core/dispatch) + if (USE_NVRTC) + add_subdirectory(cuda_rtc) + endif() + add_subdirectory(db) + add_subdirectory(distributed) + # add_subdirectory(experiments) # note, we may remove this folder at some point + add_subdirectory(ideep) + add_subdirectory(image) + add_subdirectory(video) + add_subdirectory(mkl) + add_subdirectory(mobile) + add_subdirectory(mpi) + add_subdirectory(observers) + add_subdirectory(onnx) + add_subdirectory(operators) + add_subdirectory(operators/rnn) + add_subdirectory(opt) + add_subdirectory(perfkernels) + add_subdirectory(python) + add_subdirectory(queue) + add_subdirectory(sgd) + add_subdirectory(share) + # add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit + add_subdirectory(transforms) +endif() + +# Advanced: if we have white list specified, we will do intersections for all +# main lib srcs. 
+if (CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_HIP_SRCS CAFFE2_WHITELISTED_FILES) +endif() + +# Debug messages - if you want to get a list of source files, enable the +# following. +if (FALSE) + message(STATUS "CPU sources: ") + foreach(tmp ${Caffe2_CPU_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU sources: ") + foreach(tmp ${Caffe2_GPU_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "CPU include: ") + foreach(tmp ${Caffe2_CPU_INCLUDE}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU include: ") + foreach(tmp ${Caffe2_GPU_INCLUDE}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "CPU test sources: ") + foreach(tmp ${Caffe2_CPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU test sources: ") + foreach(tmp ${Caffe2_GPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "HIP sources: ") + foreach(tmp ${Caffe2_HIP_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "HIP test sources: ") + foreach(tmp ${Caffe2_HIP_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "ATen CPU test sources: ") + foreach(tmp ${ATen_CPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "ATen CUDA test sources: ") + foreach(tmp ${ATen_CUDA_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() +endif() + +# ---[ List of libraries to link with +if (BUILD_CAFFE2) + add_library(caffe2_protos STATIC $ $) + add_dependencies(caffe2_protos Caffe_PROTO Caffe2_PROTO) +else() + # Do not include caffe2 or caffe protos, but rather have it only be + # a library to attach local protobuf. + add_library(caffe2_protos STATIC utils/dummy.cpp) +endif() +# If we are going to link protobuf locally inside caffe2 libraries, what we will do is +# to create a helper static library that always contains libprotobuf source files, and +# link the caffe2 related dependent libraries to it. +target_include_directories(caffe2_protos INTERFACE $) +# Reason for this public dependency is as follows: +# (1) Strictly speaking, we should not expose any Protobuf related functions. We should +# only use function interfaces wrapped with our own public API, and link protobuf +# locally. +# (2) However, currently across the Caffe2 codebase, we have extensive use of protobuf +# functionalities. For example, not only libcaffe2.so uses it, but also other +# binaries such as python extensions etc. As a result, we will have to have a +# transitive dependency to libprotobuf. +# +# Good thing is that, if we specify CAFFE2_LINK_LOCAL_PROTOBUF, then we do not need to +# separately deploy protobuf binaries - libcaffe2.so will contain all functionalities +# one needs. One can verify this via ldd. +# +# TODO item in the future includes: +# (1) Enable using lite protobuf +# (2) Properly define public API that do not directly depend on protobuf itself. +# (3) Expose the libprotobuf.a file for dependent libraries to link to. +# +# What it means for users/developers? +# (1) Users: nothing affecting the users, other than the fact that CAFFE2_LINK_LOCAL_PROTOBUF +# avoids the need to deploy protobuf. +# (2) Developers: if one simply uses core caffe2 functionality without using protobuf, +# nothing changes. 
If one has a dependent library that uses protobuf, then one needs to +# have the right protobuf version as well as linking to libprotobuf.a. +target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf) + +# Compile exposed libraries. +list(APPEND Caffe2_CPU_SRCs $) +add_library(caffe2 ${Caffe2_CPU_SRCS}) +caffe2_interface_library(caffe2_protos caffe2_protos_whole) +target_link_libraries(caffe2 PRIVATE caffe2_protos_whole) +if (${CAFFE2_LINK_LOCAL_PROTOBUF}) + target_link_libraries(caffe2 INTERFACE protobuf::libprotobuf) +else() + target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) +endif() +target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) +target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) +target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) +target_include_directories(caffe2 INTERFACE $) +target_include_directories(caffe2 PRIVATE ${Caffe2_CPU_INCLUDE}) +target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") +# Set standard properties on the target +aten_set_target_props(caffe2) +target_compile_options(caffe2 INTERFACE "-std=c++11") +target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") +# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) +target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") +install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) +caffe2_interface_library(caffe2 caffe2_library) +list(APPEND Caffe2_MAIN_LIBS caffe2_library) + +# ---[ CUDA library. +if(USE_CUDA) + # A hack to deal with cuda library dependencies and modern CMake: the + # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, + # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This + # hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with + # it. We will then manually add the cudart library as interface libs. + set(__tmp ${CUDA_LIBRARIES}) + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS}) + set(CUDA_LIBRARIES ${__tmp}) + target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart) + + target_include_directories( + caffe2_gpu INTERFACE $) + target_include_directories( + caffe2_gpu PRIVATE ${Caffe2_GPU_INCLUDE}) + target_link_libraries( + caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) + + # These public dependencies must go after the previous dependencies, as the + # order of the libraries in the linker call matters here when statically + # linking; libculibos and cublas must be last. + target_link_libraries( + caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) + + # Set standard properties on the target + aten_set_target_props(caffe2_gpu) + + install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib) + caffe2_interface_library(caffe2_gpu caffe2_gpu_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library) +endif() + +# ---[ Caffe2 HIP sources. +if(USE_ROCM) + # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. + IF(BUILD_ATEN) + HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES}) + ENDIF() + IF(BUILD_CAFFE2) + set_source_files_properties(${Caffe2_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + ENDIF() + hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) + + # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added. 
+ set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS}) + target_link_libraries(caffe2_hip PUBLIC caffe2) + target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + + # Since PyTorch files contain HIP headers, this is also needed to capture the includes. + target_include_directories(caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES}) + target_include_directories(caffe2_hip INTERFACE $) + + IF(BUILD_ATEN) + aten_set_target_props(caffe2_hip) + ENDIF() + + # When a library has object files that contain device code, it needs to use hipcc/hcc to link. + set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + + caffe2_interface_library(caffe2_hip caffe2_hip_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) + install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) +endif() + +# ---[ Check if warnings should be errors. +if ($ENV{WERROR}) + target_compile_options(caffe2 PRIVATE -Werror) + if(USE_CUDA) + target_compile_options(caffe2_gpu PRIVATE -Werror) + endif() +endif() + +# ---[ Test binaries. +if(BUILD_CAFFE2) + if (BUILD_TEST) + set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}) + if (USE_CUDA) + list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}) + endif() + + foreach(test_src ${Caffe2_ALL_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) + target_compile_features(${test_name} PRIVATE cxx_range_for) + endif() + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() + + if(USE_ROCM) + foreach(test_src ${Caffe2_HIP_TEST_SRCS}) + set_source_files_properties(${test_src} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + get_filename_component(test_name ${test_src} NAME_WE) + hip_add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) + target_compile_features(${test_name} PRIVATE cxx_range_for) + endif() + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() + endif() + + endif() +endif() + +set(__aten_test_dir "test") +if(BUILD_CAFFE2) + # Aten tests should only run when Caffe2 is not built + set(__aten_test_dir "test/aten") +endif() +# Todo - Set up ATen tests for ROCm in an upcoming PR +if(BUILD_ATEN AND NOT USE_ROCM) + foreach(test_src ${ATen_CPU_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) + endforeach() + + if(USE_CUDA OR USE_ROCM) + foreach(test_src ${ATen_CUDA_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + torch_cuda_based_add_executable(${test_name} "${test_src}") + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION 
${__aten_test_dir}) + endforeach() + endif() +endif() + +if(BUILD_CAFFE2) + if (BUILD_PYTHON) + # Python site-packages + # Get canonical directory for python site packages (relative to install + # location). It varies from system to system. + pycmd(PYTHON_SITE_PACKAGES " + from distutils import sysconfig + print(sysconfig.get_python_lib(prefix='')) + ") + SET(PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES} PARENT_SCOPE) # for Summary + # ---[ Options. + SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)") + message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path") + # Python extension suffix + # Try to get from python through sysconfig.get_env_var('EXT_SUFFIX') first, + # fallback to ".pyd" if windows and ".so" for all others. + pycmd(PY_EXT_SUFFIX " + from distutils import sysconfig + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + print(ext_suffix if ext_suffix else '') + ") + if("${PY_EXT_SUFFIX}" STREQUAL "") + if (MSVC) + set(PY_EXT_SUFFIX ".pyd") + else() + set(PY_EXT_SUFFIX ".so") + endif() + endif() + + # Allow different install locations for libcaffe2 + # For setuptools installs (that all build Python), install libcaffe2 into + # site-packages, alongside the torch libraries. The pybind11 library needs + # an rpath to the torch library folder + # For cmake installs, including c++ only installs, install libcaffe2 into + # CMAKE_INSTALL_PREFIX/lib . The pybind11 library can have a hardcoded + # rpath + if(APPLE) + set(_rpath_portable_origin "@loader_path") + else() + set(_rpath_portable_origin $ORIGIN) + endif(APPLE) + set(caffe2_pybind11_rpath "${_rpath_portable_origin}") + if(${BUILDING_WITH_TORCH_LIBS}) + # site-packages/caffe2/python/caffe2_pybind11_state + # site-packages/torch/lib + set(caffe2_pybind11_rpath "${_rpath_portable_origin}/../../torch/lib") + endif(${BUILDING_WITH_TORCH_LIBS}) + + + # ---[ Python. 
+ add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "" DEBUG_POSTFIX "") + set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state caffe2_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install caffe2_pybind11_state(_gpu|hip) in site-packages/caffe2/python, + # so it needs an rpath to find libcaffe2 + set_target_properties( + caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + + if(USE_CUDA) + add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "" DEBUG_POSTFIX "") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state_gpu ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install with same rpath as non-gpu caffe2_pybind11_state + set_target_properties( + caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + endif() + + if(USE_ROCM) + add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state_hip caffe2_library caffe2_hip_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state_hip ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install with same rpath as non-hip caffe2_pybind11_state + set_target_properties( + caffe2_pybind11_state_hip PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state_hip DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + endif() + + if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio") + # If we are building under windows, we will copy the file from + # build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd + # to its parent folder so that we can do in-build execution. 
+ add_custom_target(windows_python_copy_lib ALL) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + if (USE_CUDA) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + endif() + if (USE_ROCM) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state_hip) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + endif() + endif() + + # Finally, Copy all python files to build directory + # Generate and create all needed __init__.py files, if they aren't already + # present in the current source tree. + message(STATUS "Automatically generating missing __init__.py files.") + caffe_autogen_init_py_files() + + # Create a custom target that copies all python files. + file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}/caffe2/*.py") + add_custom_target(python_copy_files ALL) + if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja") + # ninja fails when the command line is too long so we split + # the target into several. This would be beneficial for VS also + # since it build targets in parallel but not custom commands + foreach(python_src ${PYTHON_SRCS}) + get_filename_component(dir ${python_src} DIRECTORY) + string(SHA1 name_hash "${python_src}") + # get_filename_component(name_we ${python_src} NAME_WE) + add_custom_target(python_copy_files_${name_hash} + COMMAND ${CMAKE_COMMAND} -E copy + ${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir}) + add_dependencies(python_copy_files python_copy_files_${name_hash}) + endforeach() + else() + foreach(python_src ${PYTHON_SRCS}) + get_filename_component(dir ${python_src} DIRECTORY) + add_custom_command( + TARGET python_copy_files PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir}) + endforeach() + endif() + + # Install commands + # Pick up static python files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + # Caffe proto files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + # Caffe2 proto files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + endif() +endif() + +# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope. +set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE) diff --git a/caffe2/README.md b/caffe2/README.md new file mode 100644 index 0000000..a1166b8 --- /dev/null +++ b/caffe2/README.md @@ -0,0 +1,21 @@ +# Caffe2 + +[![Jenkins Build Status](https://ci.pytorch.org/jenkins/job/caffe2-master/lastCompletedBuild/badge/icon)](https://ci.pytorch.org/jenkins/job/caffe2-master) + +Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind. + +## Questions and Feedback + +Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features. 
+
+### Further Resources on [Caffe2.ai](http://caffe2.ai)
+
+* [Installation](http://caffe2.ai/docs/getting-started.html)
+* [Learn More](http://caffe2.ai/docs/learn-more.html)
+* [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html)
+* [Datasets](http://caffe2.ai/docs/datasets.html)
+* [Model Zoo](http://caffe2.ai/docs/zoo.html)
+* [Tutorials](http://caffe2.ai/docs/tutorials.html)
+* [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html)
+* [C++ API](http://caffe2.ai/doxygen-c/html/classes.html)
+* [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html)
diff --git a/caffe2/VERSION_NUMBER b/caffe2/VERSION_NUMBER
new file mode 100644
index 0000000..53a48a1
--- /dev/null
+++ b/caffe2/VERSION_NUMBER
@@ -0,0 +1 @@
+0.8.2
\ No newline at end of file
diff --git a/caffe2/__init__.py b/caffe2/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/caffe2/contrib/CMakeLists.txt b/caffe2/contrib/CMakeLists.txt
new file mode 100644
index 0000000..be8c0bd
--- /dev/null
+++ b/caffe2/contrib/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_subdirectory(aten)
+add_subdirectory(gloo)
+add_subdirectory(nccl)
+add_subdirectory(opencl)
+add_subdirectory(prof)
+add_subdirectory(shm_mutex)
+add_subdirectory(script)
+if (USE_TENSORRT)
+add_subdirectory(tensorrt)
+endif()
+
+# Pass the src lists back to the parent
+
+# CPU source, include, deps, test sources, binary sources
+set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
+set(Caffe2_CPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
+set(Caffe2_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS} PARENT_SCOPE)
+set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
+set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
+
+# GPU source, include, deps, test sources, binary sources
+set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
+set(Caffe2_GPU_INCLUDE ${Caffe2_GPU_INCLUDE} PARENT_SCOPE)
+set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
+set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
+set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
diff --git a/caffe2/contrib/__init__.py b/caffe2/contrib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt
new file mode 100644
index 0000000..5bc2341
--- /dev/null
+++ b/caffe2/contrib/aten/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(BUILD_ATEN)
+  # Add source generated by Codegen.cmake and pass to parent
+  list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc)
+  list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc)
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
+  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
+endif()
diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md
new file mode 100644
index 0000000..d3046aa
--- /dev/null
+++ b/caffe2/contrib/aten/README.md
@@ -0,0 +1,80 @@
+# An ATen operator for Caffe2
+
+[ATen](https://github.com/zdevito/aten) is a simple tensor library that exposes the Tensor operations in Torch
+and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API
+that makes these functions available in Caffe2 as an operator. It also makes it accessible using the
+ToffeeIR.
+
+
+### Example Usage in Caffe2
+
+First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
+[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
+
+We will call the `pow` operator:
+
+```
+static inline Tensor pow(const Tensor & self, Scalar exponent);
+```
+
+Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
+and there is always a string attribute `operator` that defines which ATen function to call:
+
+```
+import numpy as np
+from caffe2.python import core, workspace
+
+
+# create the Caffe2 Op:
+op = core.CreateOperator(
+    "ATen",
+    ["MyInput"],
+    ["MyOutput"],
+    operator="pow", exponent=2.0)
+```
+
+Each `Tensor` input becomes a Caffe2 input blob, and each output becomes a Caffe2 output blob.
+Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
+In the case of `Scalar`, the attribute can be either an integer or a floating point number.
+
+The op can now be run like any other Caffe2 operator:
+
+```
+workspace.FeedBlob("MyInput", np.random.randn(2, 3).astype(np.float32))
+workspace.RunOperatorOnce(op)
+print(workspace.FetchBlob("MyOutput"))
+```
+
+For methods, the first input is always the `this` Tensor in C++.
+To call methods of ATen's `Type` objects, you provide an additional string attribute
+that determines the type:
+
+```
+# create a 2x4 tensor filled with floating point ones
+op = core.CreateOperator(
+    "ATen",
+    [],
+    ["MyOutput"],
+    operator="ones", type="float", size={2, 4})
+```
+
+Generally, ATen operators are polymorphic across input types and work on both the CPU and CUDA.
+
+### Example Usage via PyTorch Symbolic
+
+The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
+to ONNX.
In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API: + +``` +class Add(torch.autograd.Function): + + @staticmethod + def symbolic(g, a, b): + return g.op("ATen", a, b, operator_s = "add") + + @staticmethod + def forward(ctx, a, b): + return a + b +``` diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc new file mode 100644 index 0000000..bc93f48 --- /dev/null +++ b/caffe2/contrib/aten/aten_op.cc @@ -0,0 +1,26 @@ +#include "caffe2/contrib/aten/aten_op.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(ATen, ATenOp); +template<> +at::Backend ATenOp::backend() const { + return at::kCPU; +} + +OPERATOR_SCHEMA(ATen); +CAFFE_KNOWN_TYPE(at::Half); + +namespace math { +template <> +void Set( + const size_t /*N*/, + const at::Half h, + at::Half* v, + CPUContext* c) { + Set(0, h.x, (uint16_t*) v, c); +} +} + +} diff --git a/caffe2/contrib/aten/aten_op.h b/caffe2/contrib/aten/aten_op.h new file mode 100644 index 0000000..7161e4a --- /dev/null +++ b/caffe2/contrib/aten/aten_op.h @@ -0,0 +1 @@ +#include "caffe2/caffe2/contrib/aten/gen_aten_op.h" diff --git a/caffe2/contrib/aten/aten_op_cuda.cc b/caffe2/contrib/aten/aten_op_cuda.cc new file mode 100644 index 0000000..d416e70 --- /dev/null +++ b/caffe2/contrib/aten/aten_op_cuda.cc @@ -0,0 +1,23 @@ +#include "caffe2/contrib/aten/aten_op.h" +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR(ATen, ATenOp); +template<> +at::Backend ATenOp::backend() const { + return at::kCUDA; +} + +namespace math { +template <> +void Set( + const size_t /*N*/, + const at::Half h, + at::Half* v, + CUDAContext* c) { + Set(0, h.x, (uint16_t*) v, c); +} +} + +} diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h new file mode 100644 index 0000000..feccafd --- /dev/null +++ b/caffe2/contrib/aten/aten_op_template.h @@ -0,0 +1,226 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// a map from descriptor strings (see [DESCRIPTORS]) +// to the key in the switch statement that implements them +static std::unordered_map op_to_key = { + ${mappings} +}; + +namespace caffe2 { + +using at::Half; // for AT_FORALL_SCALAR_TYPES + +template +class ATenOp : public Operator { + public: + ATenOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) { + VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n"; + switch(findImplementation(operator_def)) { + ${implementations} + default: + CAFFE_THROW("Unexpected key value for aten operator"); + } + } + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override { + return run_op(); + } +private: + // actual operator implementation is initialized in ctor. 
+ std::function run_op; + at::Backend backend() const; + + TypeMeta typeMetaFor(const at::Tensor & t) { + return typeMetaFor(t.type().scalarType()); + } + TypeMeta typeMetaFor(at::ScalarType st) { + #define DEFINE_CASE(ctype,aten_name,_) \ + case at::k##aten_name: \ + return TypeMeta::Make(); + switch(st) { + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + default: + CAFFE_THROW("Unknown ATen Type"); + } + #undef DEFINE_CASE + } + + at::Type & typeFor(const Tensor & ten) { + return at::getType(backend(), atScalarTypeFor(ten.meta())); + } + at::Tensor tensorWrapping(const Tensor& ten_) { + auto& ten = const_cast&>(ten_); + return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims()); + } + + at::Tensor peek(size_t i, size_t N) { + auto real_idx = InputSize() - N + i; + return tensorWrapping(Input(real_idx)); + } + + std::vector peekSlice(size_t i, size_t len, size_t N) { + std::vector results; + for (size_t ii = i; ii < i + len; ++ii) { + results.push_back(peek(ii, N)); + } + return results; + } + + at::ScalarType atScalarTypeFor(const TypeMeta & meta) { + #define DEFINE_IF(ctype,aten_name,_) \ + if(meta.Match()) { \ + return at::k##aten_name; \ + } + AT_FORALL_SCALAR_TYPES(DEFINE_IF) + #undef DEFINE_IF + // Special case for bool, since the type in ATen is actually Byte + if (meta.Match()) { + return at::kByte; + } + CAFFE_THROW("Unknown type meta"); // TODO: improve error message... + } + void assignTo(Tensor * dst, const at::Tensor & src_) { + at::Tensor src = src_.contiguous(); + auto at_sizes = src.sizes(); + std::vector dims(at_sizes.begin(),at_sizes.end()); + dst->Resize(dims); + dst->ShareExternalPointer( + src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { + // return a closure that holds a handle to t until it is called + // to keep the aten memory alive + return src.reset(); + }); + } + void assignListStartingAt( + size_t offset, + const std::vector& tensors) { + for (size_t i = 0; i < tensors.size(); i++) { + assignTo(Output(offset + i), tensors[i]); + } + } + + // the AT_FORALL_SCALAR_TYPES macro just gives a 'i' or 'd' argument + // for each type to specify if it is stored as a integer or a double. 
+ // We need this workaround here to extract the value in the scalar losslessly + // because in some cases like 'sum' Torch promotes float to double + // and will complain if we downcast it with toFloat, causing it + // to lose precision + double extract_d(const at::Scalar & s) { + return s.toDouble(); + } + int64_t extract_i(const at::Scalar & s) { + return s.toLong(); + } + + void assignTo(Tensor * dst, at::Type & inferred_type, at::Scalar scalar) { + switch(inferred_type.scalarType()) { + #define DEFINE_CASE(ctype,aten_name,native) \ + case at::k##aten_name: { \ + auto value = extract_##native(scalar); \ + assignToValue(dst, at::convert(value)); \ + } break; + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + #undef DEFINE_CASE + default: + CAFFE_THROW("Unknown ATen Type"); + } + } + template + void assignToValue(Tensor * dst, T v) { + dst->Resize(std::vector()); + math::Set(1, v, dst->template mutable_data(), &context_); + } + int findImplementation(const OperatorDef& operator_def) { + CAFFE_ENFORCE(HasArgument("operator")); + std::string op = OperatorBase::GetSingleArgument("operator", ""); + // construct descriptor string ([DESCRIPTORS]) given the attributes + // and inputs of this operator_def, and look up the implementation key + // for this variant + std::stringstream descriptor; + descriptor << op; + std::vector attrs; + for(size_t i = 0; i < operator_def.arg_size(); i++) { + auto & attr = operator_def.arg(i); + if(attr.name() == "operator" || attr.name() == "type" ) + continue; + attrs.push_back(attr.name()); + } + std::sort(attrs.begin(), attrs.end()); + for(auto & a : attrs) + descriptor << "-" << a; + + std::string descriptor_sized = + descriptor.str() + "-" + caffe2::to_string(InputSize()); + std::string descriptor_var_args = descriptor.str() + "-*"; + if (op_to_key.count(descriptor_sized) > 0) { + return op_to_key[descriptor_sized]; + } + if (op_to_key.count(descriptor_var_args) > 0) { + return op_to_key[descriptor_var_args]; + } + std::stringstream ss; + ss << "Attempting to run unknown ATen operator configuration: " + << descriptor_sized; + CAFFE_THROW(ss.str()); + } + at::Scalar readScalarAttribute(const std::string & name) { + if(OperatorBase::HasSingleArgumentOfType(name)) { + return OperatorBase::GetSingleArgument(name, 0); + } else { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return OperatorBase::GetSingleArgument(name, 0); + } + } + template + T readAttribute(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return OperatorBase::GetSingleArgument(name, 0); + } + std::vector readIntList(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasArgument(name)); + return OperatorBase::GetRepeatedArgument(name, {}); + } + template + std::array readBoolMask(const std::string& name) { + CAFFE_ENFORCE(OperatorBase::HasArgument(name)); + std::vector ints = + OperatorBase::GetRepeatedArgument(name, {}); + std::array result; + for (size_t i = 0; i < N; ++i) { + result[i] = ints.at(i); + } + return result; + } + at::ScalarType stringToScalarType(const std::string & name) { + #define DEFINE_IF(type,aten) \ + if(#type == name) \ + return at::k##aten; + DEFINE_IF(float16, Half) + DEFINE_IF(float, Float) + DEFINE_IF(double, Double) + DEFINE_IF(uint8, Byte) + DEFINE_IF(int8, Char) + DEFINE_IF(int16, Short) + DEFINE_IF(int32, Int) + DEFINE_IF(int64, Long) + CAFFE_THROW("unsupported type annotation: ", name); + } + at::Type & stringToType(const std::string & name) { + return at::getType(backend(), stringToScalarType(name)); + } + 
at::Type * readTypeAttribute(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return &stringToType(OperatorBase::GetSingleArgument(name, "")); + } +}; + +} diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py new file mode 100644 index 0000000..52d5c38 --- /dev/null +++ b/caffe2/contrib/aten/aten_test.py @@ -0,0 +1,106 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, dyndep +from hypothesis import given + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op') + + +class TestATen(hu.HypothesisTestCase): + + @given(inputs=hu.tensors(n=2), **hu.gcs) + def test_add(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["X", "Y"], + ["Z"], + operator="add") + + def ref(X, Y): + return [X + Y] + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(inputs=hu.tensors(n=1), **hu.gcs) + def test_pow(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["S"], + ["Z"], + operator="pow", exponent=2.0) + + def ref(X): + return [np.square(X)] + + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(x=st.integers(min_value=2, max_value=8), **hu.gcs) + def test_sort(self, x, gc, dc): + inputs = [np.random.permutation(x)] + op = core.CreateOperator( + "ATen", + ["S"], + ["Z", "I"], + operator="sort") + + def ref(X): + return [np.sort(X), np.argsort(X)] + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(inputs=hu.tensors(n=1), **hu.gcs) + def test_sum(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["S"], + ["Z"], + operator="sum") + + def ref(X): + return [np.sum(X)] + + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(**hu.gcs) + def test_ones(self, gc, dc): + op = core.CreateOperator( + "ATen", + [], + ["Z"], + operator="ones", type="float", size={2, 4}) + + def ref(): + return [np.ones([2, 4])] + + self.assertReferenceChecks(gc, op, [], ref) + + @given(**hu.gcs) + def test_index_put(self, gc, dc): + op = core.CreateOperator( + "ATen", + ['self', 'indices', 'values'], + ["Z"], + operator="index_put") + + def ref(self, indices, values): + self[indices] = values + return (self,) + + + tensor = np.random.randn(3, 3).astype(np.float32) + mask = np.array([[True, True, True], [True, False, False], [True, True, False]]) + values = np.random.randn(6).astype(np.float32) + + self.assertReferenceChecks(gc, op, [tensor, mask, values], ref) + + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md new file mode 100644 index 0000000..04ddaef --- /dev/null +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -0,0 +1,157 @@ +# Using ONNX and ATen to export models from PyTorch to Caffe2 + +When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up +hitting operators that are not yet part of the ONNX specification. These may be +operators that haven't been standardized yet, or custom `torch.autograd.Function` types that +are specific to a network. + +To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. 
+[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten)
+that can run these tensor functions in a Caffe2 network after importing them through ONNX.
+
+This guide explains how to configure Caffe2 and modify your PyTorch program to use
+this functionality.
+
+### Enable ATen in Caffe2
+
+The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
+when you configure Caffe2 using cmake:
+
+```
+git clone https://github.com/caffe2/caffe2/
+mkdir caffe2/build
+cd caffe2/build
+cmake -DUSE_ATEN=ON ..
+make install
+```
+
+### Describing How to Export a PyTorch Autograd Function using ATen
+
+To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
+in the forward pass of a network. For each function in the trace, it calls that function's
+`symbolic` method, which describes how to construct the part of the ONNX graph
+that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).
+
+When equivalent ONNX operators do not exist, you can instead call any ATen function.
+As an example, let's assume we have an autograd function which computes `x*x+y`:
+
+```
+class MyFunction(Function):
+    @staticmethod
+    def forward(ctx, x, y):
+        return x*x + y
+```
+
+We can add a `symbolic` method to it like so:
+
+```
+class MyFunction(Function):
+    @staticmethod
+    def forward(ctx, x, y):
+        return x*x + y
+
+    @staticmethod
+    def symbolic(graph, x, y):
+        x2 = graph.at("mul", x, x)
+        r = graph.at("add", x2, y)
+        # x, y, x2, and r are 'Node' objects.
+        # print(r) or print(graph) will print out a textual representation for debugging.
+        # This representation will be converted to ONNX protobufs on export.
+        return r
+```
+
+The function `graph.at` adds a new ATen op to the computation graph.
+You can call any ATen function using this facility. To do so,
+first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
+[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
+
+As an example, we might want to call the `pow` operator:
+
+```
+static inline Tensor pow(const Tensor & self, Scalar exponent);
+```
+
+We can translate this into the equivalent `graph.at` function:
+
+```
+def symbolic(graph, x):
+    return graph.at("pow", x, exponent_f=2.0)  # compute x**2
+```
+
+Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
+like `exponent` becomes a keyword argument that specifies an ONNX attribute.
+Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).
+
+For methods, the first input is always the `this` Tensor in C++.
+To call methods of ATen's `Type` objects, you provide an additional string attribute
+that determines the type. For instance, `ones` creates a new constant tensor of all ones:
+```
+class Type {
+  ...
+  virtual Tensor ones(IntList size) const;
+  ...
+};
+```
+
+From PyTorch it can be created by adding the type as an additional attribute:
+
+```
+def symbolic(graph, x):
+    return graph.at("ones", type_s="float", size_i=[2,4])
+```
+
+Generally, ATen operators are polymorphic across input types and work on both the CPU and CUDA.
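+
+As a short sketch combining the pieces above (the `SquarePlusOnes` function and the fixed
+`3x4` shape are hypothetical, chosen only so the calls already shown can be reused), a
+`symbolic` method can mix tensor arguments with typed attributes:
+
+```
+class SquarePlusOnes(Function):
+    @staticmethod
+    def forward(ctx, x):
+        # assumes x is a 3x4 tensor
+        return x*x + 1
+
+    @staticmethod
+    def symbolic(graph, x):
+        # x**2 via the ATen "pow" function; the Scalar exponent becomes a float attribute (_f)
+        x2 = graph.at("pow", x, exponent_f=2.0)
+        # a constant tensor built from a Type method, selected with a string attribute (_s)
+        # and an integer list attribute (_i)
+        ones = graph.at("ones", type_s="float", size_i=[3, 4])
+        return graph.at("add", x2, ones)
+```
+
+On export, each `graph.at` call above becomes its own `ATen` node in the ONNX graph.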
+ +## Putting it together + +With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`: + +``` +class MyModule(nn.Module): + def forward(self, x, y): + # you can combine your ATen ops with standard onnx ones + x = nn.ReLU()(x) + return MyFunction.apply(x, y) + +torch.onnx.export(MyModule(), + (Variable(torch.ones(3,4)), Variable(torch.ones(3,4))), + "output.onnx", + verbose=True) +``` + +This exports the following graph, which contains calls the `ATen` operator: + +``` +graph(%1 : Float(3, 4) + %2 : Float(3, 4)) { + %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1]; + %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0]; + %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0]; + return (%5); +} +``` + +The graph can then be imported using ONNX and run with Caffe2: + +``` +import onnx +import caffe2.python.onnx.backend +import numpy as np + +graph = onnx.load("output.onnx") + +a = np.random.randn(3, 2).astype(np.float32) +b = np.random.randn(3, 2).astype(np.float32) + +prepared_backend = caffe2.python.onnx.backend.prepare(graph) +W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b} +c2_out = prepared_backend.run(W)[0] + +x = np.maximum(a, 0) +r = x*x + b +np.testing.assert_array_almost_equal(r, c2_out) +``` + +### Code + +For the full source code for this tutorial, see [sample.py](sample.py). diff --git a/caffe2/contrib/aten/docs/sample.py b/caffe2/contrib/aten/docs/sample.py new file mode 100644 index 0000000..71e2005 --- /dev/null +++ b/caffe2/contrib/aten/docs/sample.py @@ -0,0 +1,54 @@ +import numpy as np + +from torch import nn +from torch.autograd import Variable, Function +import torch.onnx + +import onnx +import caffe2.python.onnx.backend + +class MyFunction(Function): + @staticmethod + def forward(ctx, x, y): + return x*x + y + @staticmethod + def symbolic(graph, x, y): + x2 = graph.at("mul", x, x) + r = graph.at("add", x2, y) + # x, y, x2, and r are 'Node' objects + # print(r) or print(graph) will print out a textual representation for debugging. + # this representation will be converted to ONNX protobufs on export. + return r + +class MyModule(nn.Module): + def forward(self, x, y): + # you can combine your ATen ops with standard onnx ones + x = nn.ReLU()(x) + return MyFunction.apply(x, y) + +torch.onnx.export(MyModule(), + (Variable(torch.ones(3,4)), Variable(torch.ones(3,4))), + "output.onnx", + verbose=True) + +# prints the graph for debugging: +# graph(%1 : Float(3, 4) +# %2 : Float(3, 4)) { +# %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1]; +# %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0]; +# %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0]; +# return (%5); +# } + +graph = onnx.load("output.onnx") + +a = np.random.randn(3, 4).astype(np.float32) +b = np.random.randn(3, 4).astype(np.float32) + +prepared_backend = caffe2.python.onnx.backend.prepare(graph) +W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b} +c2_out = prepared_backend.run(W)[0] + +x = np.maximum(a, 0) +r = x*x + b +np.testing.assert_array_almost_equal(r, c2_out) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py new file mode 100755 index 0000000..18a3db4 --- /dev/null +++ b/caffe2/contrib/aten/gen_op.py @@ -0,0 +1,304 @@ +#!/bin/env python + +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +import sys +import yaml +import argparse +import os +from copy import deepcopy + +parser = argparse.ArgumentParser() +parser.add_argument("--template_dir", default=".", help="where template.h is") +parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen", + help="where ATen yaml files are") +parser.add_argument("--output_prefix", default="", help="") +parser.add_argument( + "--install_dir", default=".", help="where to put generated file") +parser.add_argument("--aten_root", default="", help="root directory of aten") +args, _ = parser.parse_known_args() + +if args.aten_root: + if not os.path.exists(args.aten_root): + raise ValueError('aten_root ({}) does not exist'.format( + args.aten_root)) + sys.path.append(os.path.join(args.aten_root, 'src', 'ATen')) + from code_template import CodeTemplate as CT +else: + from src.ATen.code_template import CodeTemplate as CT + +OP_TEMPLATE = CT.from_file( + os.path.join(args.template_dir, 'aten_op_template.h')) + + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +def write(filename, s): + with open(filename, "w") as f: + f.write(s) + + +def read(filename): + with open(filename, "r") as f: + return f.read() + + +def value_has_tensors(v): + # Sparse shouldn't appear in public API, seems to be temporary bug + return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type'] + + +def value_is_tensor_type(v): + return value_has_tensors(v) and v['dynamic_type'] != 'TensorList' + + +# for each aten type, how do we handle a return value of that type? +RETURN_MAP = { + 'Tensor': 'assignTo(Output(${offset}),${output});', + 'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});', + 'bool': 'assignToValue(Output(${offset}),${output});', + 'int64_t': 'assignToValue(Output(${offset}),${output});', + 'std::vector': 'assignListStartingAt(${offset}, ${output});', +} + +# for each non-Tensor aten argument, how to we read it from caffe2's +# attribute list. Most of these call runtime functions defined in the +# template class. +ARGUMENT_MAP = { + 'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");', + 'bool': 'bool ${arg} = readAttribute("${arg}");', + 'int': 'int ${arg} = readAttribute("${arg}");', + 'double': 'double ${arg} = readAttribute("${arg}");', + 'int64_t': 'int64_t ${arg} = readAttribute("${arg}");', + 'IntList': 'auto ${arg} = readIntList("${arg}");', + 'std::array': 'auto ${arg} = readBoolMask<2>("${arg}");', + 'std::array': 'auto ${arg} = readBoolMask<3>("${arg}");', +} + + +def expand(o): + num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments']) + results = [o] + for i in range(0, num_defaults): + # last num_default values should be default + assert('default' in o['arguments'][-(i + 1)]) + v = deepcopy(o) + v['arguments'] = v['arguments'][:-(i + 1)] + results.append(v) + return results + + +# filter the list of declarations removing things we cannot support +def supports(o, factory_methods): + # Ignore all families (!) 
of functions that have TensorOptions (i.e. tensor factory methods). + if o['name'] in factory_methods: + if factory_methods[o['name']] == 0: + print("Skipping {} because it is a factory method".format(o['name'])) + factory_methods[o['name']] += 1 + return False + + # skip all in-place operators for now since aten cannot Resize + # caffe2 memory inside an operator + if o['inplace']: + return False + + # _out variants also work in-place on arguments taken as destinations + # we also cannot handle these because aten cannot resize caffe2 Tensors + if "_out" in o['name']: + return False + + # skip return types we cannot handle + for ret in o['returns']: + if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP: + print("Skipping {} Because of Ret: {} ({})".format( + o['name'], ret['type'], ret['dynamic_type'])) + return False + + # skip arguments we cannot handle + for arg in o['arguments']: + if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP: + print("Skipping {} Because of Arg: {} ({}) ".format( + o['name'], arg['type'], arg['dynamic_type'])) + return False + return True + + +# template for each potential operator. +# each operator has an integer 'key' associated with it, and +# a lambda that defines the operator +# non-tensor attributes are created in ${initialization} +# and then saved as arguments to the lambda +# Inputs/Outputs are read inside the lambda +OPTION_TEMPLATE = CT("""\ +case ${key}: { // ${name} + ${initialization} + run_op = [=] { + ${statements} + auto the_result = ${invocation}; + ${assignments} + return true; + }; +} break; +""") + + +def get_output(o, i): + if len(o['returns']) == 1: + return 'the_result' + else: + return 'std::get<{}>(the_result)'.format(i) + + +def attribute_names(o): + return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)]) + + +def required_attribute_names(o): + return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a]) + + +def self_as_first_argument(arguments): + return ([a for a in arguments if a['name'] == 'self'] + + [a for a in arguments if a['name'] != 'self']) + + +def get_num_inputs(o): + args = 0 + for a in o['arguments']: + if a['type'] == 'TensorList': + return '*' + elif value_has_tensors(a): + args += 1 + return str(args) + + +def find_factory_methods(decls): + factory_methods = {} + for o in decls: + if any(arg['dynamic_type'] == 'TensorOptions' for arg in o['arguments']): + factory_methods[o['name']] = 0 + return factory_methods + + +if __name__ == '__main__': + decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader) + factory_methods = find_factory_methods(decls) + filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)] + top_env = { + 'mappings': [], + 'implementations': [], + } + seen = set() + key = 0 + for o in filtered: + # [DESCRIPTORS] + # each option is associated with a descriptor string that is used + # to figure out which version of an op is being used: + # The format is: + # opname-num_inputs-attribute_1-attribute2 + # Example: + # lerp-2-weight + # the operator lerp takes 2 arguments and has the attribute weight + attr_names = attribute_names(o) + num_inputs = get_num_inputs(o) + descriptor = '-'.join([o['name']] + attr_names + [num_inputs]) + if descriptor in seen: + continue + seen.add(descriptor) + + # map from descriptor string to the integer key in the switch statements + # that initializes the operators + top_env['mappings'].append('{{ "{}", {} 
}},'.format(descriptor, key)) + env = { + 'name': o['name'], + 'statements': [], + 'arguments': [], + 'assignments': [], + 'initialization': [], + 'key': str(key), + } + defined_inferred_type = False + + if 'Tensor' in o['method_of']: + # make sure 'self' is the first argument. currently Declarations.yaml + # does not always do this. Instead it keeps the argument list the same order + # as the Type method. + o['arguments'] = self_as_first_argument(o['arguments']) + elif 'namespace' not in o['method_of']: + # methods on type like 'ones' or 'zeros' always take a + # string attribute that is translated into the at::Type object + # e.g. "Float" is at::kFloat + assert('Type' in o['method_of']) + defined_inferred_type = True + env['initialization'].append( + 'auto inferred_type = readTypeAttribute("type");') + + static_tensor_inputs = sum(arg['type'] != 'TensorList' and value_is_tensor_type(arg) for arg in o['arguments']) + has_tensorlist = any(arg['type'] == 'TensorList' for arg in o['arguments']) + if has_tensorlist: + tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] == 'TensorList'][0] + + real_inputs = 0 + for i, arg in enumerate(o['arguments']): + env['arguments'].append(arg['name']) + # Emulate logic in gen_jit_dispatch.py. Pretend the flat argument + # list is a stack where the end is the top. + view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs + if arg['type'] == 'TensorList': + # NOTE: do not advance real_inputs here. After this we will + # switch to indexing the "stack" from the end as if we only had + env['statements'].append( + 'auto {} = peekSlice({}, InputSize() - {}, InputSize());' + .format(arg['name'], real_inputs, static_tensor_inputs)) + elif value_is_tensor_type(arg): + # load tensor inputs from Caffe2 + + env['statements'].append( + 'auto {} = peek({}, {});'.format(arg['name'], real_inputs, view_length)) + real_inputs += 1 + if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type: + # first tensor input is used to define the output type. + defined_inferred_type = True + env['statements'].append( + 'auto inferred_type = &({}.type());'.format( + arg['name'])) + else: + init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) + env['initialization'].append(init) + + for i, r in enumerate(o['returns']): + t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor'] + assignment = CT(t).substitute(env, offset=i, output=get_output(o, i)) + env['assignments'].append(assignment) + + if 'Tensor' in o['method_of']: + env['invocation'] = "self.{}({})".format( + o['name'], ', '.join(env['arguments'][1:])) + elif 'namespace' in o['method_of']: + env['invocation'] = CT("at::${name}(${arguments})").substitute(env) + else: + assert('Type' in o['method_of']) + env['invocation'] = CT( + 'inferred_type->${name}(${arguments})').substitute(env) + + top_env['implementations'].append(OPTION_TEMPLATE.substitute(env)) + key += 1 + write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env)) diff --git a/caffe2/contrib/cuda-convnet2/LICENSE b/caffe2/contrib/cuda-convnet2/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/caffe2/contrib/cuda-convnet2/README.md b/caffe2/contrib/cuda-convnet2/README.md new file mode 100644 index 0000000..f921264 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/README.md @@ -0,0 +1,7 @@ +# cuda-convnet2 +Automatically exported from code.google.com/p/cuda-convnet2 + +You can read the documentation in two ways: + +1. On this site: go to branches > wiki. +2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/ diff --git a/caffe2/contrib/cuda-convnet2/build.sh b/caffe2/contrib/cuda-convnet2/build.sh new file mode 100755 index 0000000..1ecbdd2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/build.sh @@ -0,0 +1,50 @@ +#!/bin/sh +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### + +# Fill in the below environment variables. +# +# If you're not sure what these paths should be, +# you can use the find command to try to locate them. +# For example, NUMPY_INCLUDE_PATH contains the file +# arrayobject.h. So you can search for it like this: +# +# find /usr -name arrayobject.h +# +# (it'll almost certainly be under /usr) + +# CUDA toolkit installation directory. +export CUDA_INSTALL_PATH=/usr/local/cuda + +# Python include directory. This should contain the file Python.h, among others. +export PYTHON_INCLUDE_PATH=/usr/include/python2.7 + +# Numpy include directory. This should contain the file arrayobject.h, among others. +export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ + +# ATLAS library directory. This should contain the file libcblas.so, among others. +export ATLAS_LIB_PATH=/usr/lib/atlas-base + +# You don't have to change these: +export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH +export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples +export PATH=$PATH:$CUDA_INSTALL_PATH/bin + +cd util && make numpy=1 -j $* && cd .. +cd nvmatrix && make -j $* && cd .. +cd cudaconv3 && make -j $* && cd .. +cd cudaconvnet && make -j $* && cd .. +cd make-data/pyext && make -j $* && cd ../.. + diff --git a/caffe2/contrib/cuda-convnet2/convdata.py b/caffe2/contrib/cuda-convnet2/convdata.py new file mode 100644 index 0000000..c79b635 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/convdata.py @@ -0,0 +1,291 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
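+
+# Data providers used by convnet.py:
+#  - ImageDataProvider: decodes JPEG batches on a background thread
+#    (JPEGBatchLoaderThread), optionally adds PCA color noise
+#    (ColorNoiseMakerThread), and supports multiview testing.
+#  - CIFARDataProvider: in-memory CIFAR batches with random crops and
+#    random horizontal flips at training time.
+#  - DummyConvNetLogRegDataProvider: synthetic data for quick sanity checks.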
+ +from python_util.data import * +import numpy.random as nr +import numpy as n +import random as r +from time import time +from threading import Thread +from math import sqrt +import sys +#from matplotlib import pylab as pl +from PIL import Image +from StringIO import StringIO +from time import time +import itertools as it + +class JPEGBatchLoaderThread(Thread): + def __init__(self, dp, batch_num, label_offset, list_out): + Thread.__init__(self) + self.list_out = list_out + self.label_offset = label_offset + self.dp = dp + self.batch_num = batch_num + + @staticmethod + def load_jpeg_batch(rawdics, dp, label_offset): + if type(rawdics) != list: + rawdics = [rawdics] + nc_total = sum(len(r['data']) for r in rawdics) + + jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics)) + labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics)) + + img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32) + lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32) + dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview) + lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1)) + for c in xrange(nc_total): + lab_mat[c, [z + label_offset for z in labels[c]]] = 1 + lab_mat = n.tile(lab_mat, (dp.data_mult, 1)) + + + return {'data': img_mat[:nc_total * dp.data_mult,:], + 'labvec': lab_vec[:nc_total * dp.data_mult,:], + 'labmat': lab_mat[:nc_total * dp.data_mult,:]} + + def run(self): + rawdics = self.dp.get_batch(self.batch_num) + p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics, + self.dp, + self.label_offset) + self.list_out.append(p) + +class ColorNoiseMakerThread(Thread): + def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out): + Thread.__init__(self) + self.pca_stdevs, self.pca_vecs = pca_stdevs, pca_vecs + self.num_noise = num_noise + self.list_out = list_out + + def run(self): + noise = n.dot(nr.randn(self.num_noise, 3).astype(n.single) * self.pca_stdevs.T, self.pca_vecs.T) + self.list_out.append(noise) + +class ImageDataProvider(LabeledDataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False): + LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + self.data_mean = self.batch_meta['data_mean'].astype(n.single) + self.color_eig = self.batch_meta['color_pca'][1].astype(n.single) + self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)] + self.color_noise_coeff = dp_params['color_noise'] + self.num_colors = 3 + self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors)) + self.mini = dp_params['minibatch_size'] + self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size + self.inner_pixels = self.inner_size **2 + self.border_size = (self.img_size - self.inner_size) / 2 + self.multiview = dp_params['multiview_test'] and test + self.num_views = 5*2 + self.data_mult = self.num_views if self.multiview else 1 + self.batch_size = self.batch_meta['batch_size'] + self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset'] + self.scalar_mean = dp_params['scalar_mean'] + # Maintain pointers to previously-returned data matrices so they don't get garbage collected. 
+ self.data = [None, None] # These are pointers to previously-returned data matrices + + self.loader_thread, self.color_noise_thread = None, None + self.convnet = dp_params['convnet'] + + self.num_noise = self.batch_size + self.batches_generated, self.loaders_started = 0, 0 + self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2)) + + if self.scalar_mean >= 0: + self.data_mean_crop = self.scalar_mean + + def showimg(self, img): + from matplotlib import pylab as pl + pixels = img.shape[0] / 3 + size = int(sqrt(pixels)) + img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1) + pl.imshow(img, interpolation='nearest') + pl.show() + + def get_data_dims(self, idx=0): + if idx == 0: + return self.inner_size**2 * 3 + if idx == 2: + return self.get_num_classes() + return 1 + + def start_loader(self, batch_idx): + self.load_data = [] + self.loader_thread = JPEGBatchLoaderThread(self, + self.batch_range[batch_idx], + self.label_offset, + self.load_data) + self.loader_thread.start() + + def start_color_noise_maker(self): + color_noise_list = [] + self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list) + self.color_noise_thread.start() + return color_noise_list + + def set_labels(self, datadic): + pass + + def get_data_from_loader(self): + if self.loader_thread is None: + self.start_loader(self.batch_idx) + self.loader_thread.join() + self.data[self.d_idx] = self.load_data[0] + + self.start_loader(self.get_next_batch_idx()) + else: + # Set the argument to join to 0 to re-enable batch reuse + self.loader_thread.join() + if not self.loader_thread.is_alive(): + self.data[self.d_idx] = self.load_data[0] + self.start_loader(self.get_next_batch_idx()) + #else: + # print "Re-using batch" + self.advance_batch() + + def add_color_noise(self): + # At this point the data already has 0 mean. + # So I'm going to add noise to it, but I'm also going to scale down + # the original data. This is so that the overall scale of the training + # data doesn't become too different from the test data. + + s = self.data[self.d_idx]['data'].shape + cropped_size = self.get_data_dims(0) / 3 + ncases = s[0] + + if self.color_noise_thread is None: + self.color_noise_list = self.start_color_noise_maker() + self.color_noise_thread.join() + self.color_noise = self.color_noise_list[0] + self.color_noise_list = self.start_color_noise_maker() + else: + self.color_noise_thread.join(0) + if not self.color_noise_thread.is_alive(): + self.color_noise = self.color_noise_list[0] + self.color_noise_list = self.start_color_noise_maker() + + self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size)) + self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1)) + self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff + self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size)) + self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division. 
+ + def get_next_batch(self): + self.d_idx = self.batches_generated % 2 + epoch, batchnum = self.curr_epoch, self.curr_batchnum + + self.get_data_from_loader() + + # Subtract mean + self.data[self.d_idx]['data'] -= self.data_mean_crop + + if self.color_noise_coeff > 0 and not self.test: + self.add_color_noise() + self.batches_generated += 1 + + return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T] + + + # Takes as input an array returned by get_next_batch + # Returns a (numCases, imgSize, imgSize, 3) array which can be + # fed to pylab for plotting. + # This is used by shownet.py to plot test case predictions. + def get_plottable_data(self, data, add_mean=True): + mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1)) + return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single) + +class CIFARDataProvider(LabeledDataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False): + LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + self.img_size = 32 + self.num_colors = 3 + self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size'] + self.border_size = (self.img_size - self.inner_size) / 2 + self.multiview = dp_params['multiview_test'] and test + self.num_views = 9 + self.scalar_mean = dp_params['scalar_mean'] + self.data_mult = self.num_views if self.multiview else 1 + self.data_dic = [] + for i in batch_range: + self.data_dic += [unpickle(self.get_data_file_name(i))] + self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single) + self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C') + self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C') + + self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)] + + self.batches_generated = 0 + self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1)) + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + bidx = batchnum - self.batch_range[0] + + cropped = self.cropped_data[self.batches_generated % 2] + + self.__trim_borders(self.data_dic[bidx]['data'], cropped) + cropped -= self.data_mean + self.batches_generated += 1 + return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']] + + def get_data_dims(self, idx=0): + return self.inner_size**2 * self.num_colors if idx == 0 else 1 + + # Takes as input an array returned by get_next_batch + # Returns a (numCases, imgSize, imgSize, 3) array which can be + # fed to pylab for plotting. + # This is used by shownet.py to plot test case predictions. 
+ def get_plottable_data(self, data): + return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single) + + def __trim_borders(self, x, target): + y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1]) + + if self.test: # don't need to loop over cases + if self.multiview: + start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2), + (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2), + (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)] + end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions] + for i in xrange(self.num_views): + target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1])) + else: + pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now + target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1])) + else: + for c in xrange(x.shape[1]): # loop over cases + startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1) + endY, endX = startY + self.inner_size, startX + self.inner_size + pic = y[:,startY:endY,startX:endX, c] + if nr.randint(2) == 0: # also flip the image with 50% probability + pic = pic[:,:,::-1] + target[:,c] = pic.reshape((self.get_data_dims(),)) + +class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider): + def __init__(self, data_dim): + LabeledDummyDataProvider.__init__(self, data_dim) + + self.img_size = int(sqrt(data_dim/3)) + + def get_next_batch(self): + epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self) + dic = {'data': dic[0], 'labels': dic[1]} + print dic['data'].shape, dic['labels'].shape + return epoch, batchnum, [dic['data'], dic['labels']] + + # Returns the dimensionality of the two data matrices returned by get_next_batch + def get_data_dims(self, idx=0): + return self.batch_meta['num_vis'] if idx == 0 else 1 diff --git a/caffe2/contrib/cuda-convnet2/convnet.py b/caffe2/contrib/cuda-convnet2/convnet.py new file mode 100644 index 0000000..99f8a94 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/convnet.py @@ -0,0 +1,289 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
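+
+# Front end for training and testing cuda-convnet2 models (built on IGPUModel).
+# Per-batch work is delegated to a Driver:
+#  - TrainingDriver: ordinary training/testing batches.
+#  - GradCheckDriver: numerical gradient checking.
+#  - MultiviewTestDriver: test-time averaging over multiple views; can also
+#    dump per-batch predictions to the configured test output directory.
+#  - FeatureWriterDriver: writes features of a chosen layer to disk (test only).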
+ +import numpy as n +import numpy.random as nr +import random as r +from python_util.util import * +from python_util.data import * +from python_util.options import * +from python_util.gpumodel import * +import sys +import math as m +import layer as lay +from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider +from os import linesep as NL +import copy as cp +import os + +class Driver(object): + def __init__(self, convnet): + self.convnet = convnet + + def on_start_batch(self, batch_data, train): + pass + + def on_finish_batch(self): + pass + +class GradCheckDriver(Driver): + def on_start_batch(self, batch_data, train): + data = batch_data[2] + self.convnet.libmodel.checkGradients(data) + +class TrainingDriver(Driver): + def on_start_batch(self, batch_data, train): + data = batch_data[2] + self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train) + +class MultiviewTestDriver(TrainingDriver): + def on_start_batch(self, batch_data, train): + self.write_output = False + if train: + TrainingDriver.on_start_batch(self, batch_data, train) + else: + data = batch_data[2] + num_views = self.convnet.test_data_provider.num_views + if self.convnet.test_out != "" and self.convnet.logreg_name != "": + self.write_output = True + self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1]) + self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single) + self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name) + else: + self.convnet.libmodel.startMultiviewTest(data, num_views) + + def on_finish_batch(self): + if self.write_output: + if not os.path.exists(self.convnet.test_out): + os.makedirs(self.convnet.test_out) + pickle(self.test_file_name, {'data': self.probs, + 'note': 'generated from %s' % self.convnet.save_file}) + +class FeatureWriterDriver(Driver): + def __init__(self, convnet): + Driver.__init__(self, convnet) + self.last_batch = convnet.test_batch_range[-1] + + def on_start_batch(self, batch_data, train): + if train: + raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. 
It writes test data features.") + + self.batchnum, self.data = batch_data[1], batch_data[2] + + if not os.path.exists(self.convnet.feature_path): + os.makedirs(self.convnet.feature_path) + + self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs'] + self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single) + self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features]) + + def on_finish_batch(self): + path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum) + pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]}) + print "Wrote feature file %s" % path_out + if self.batchnum == self.last_batch: + pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file, + 'num_vis':self.num_ftrs, + 'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']}) + +class ConvNet(IGPUModel): + def __init__(self, op, load_dic, dp_params={}): + filename_options = [] + for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'): + dp_params[v] = op.get_value(v) + + IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params) + + def import_model(self): + lib_name = "cudaconvnet._ConvNet" + print "=========================" + print "Importing %s C++ module" % lib_name + self.libmodel = __import__(lib_name,fromlist=['_ConvNet']) + + def init_model_lib(self): + self.libmodel.initModel(self.layers, + self.device_ids, + self.minibatch_size, + self.conserve_mem) + + def init_model_state(self): + ms = self.model_state + layers = ms['layers'] if self.loaded_from_checkpoint else {} + ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def), + os.path.join(self.layer_path, self.layer_params), self, layers=layers) + + self.do_decouple_conv() + self.do_unshare_weights() + + self.op.set_value('conv_to_local', [], parse=False) + self.op.set_value('unshare_weights', [], parse=False) + + self.set_driver() + + def do_decouple_conv(self): + # Convert convolutional layers to local + if len(self.op.get_value('conv_to_local')) > 0: + for lname in self.op.get_value('conv_to_local'): + if self.model_state['layers'][lname]['type'] == 'conv': + lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname) + + def do_unshare_weights(self): + # Decouple weight matrices + if len(self.op.get_value('unshare_weights')) > 0: + for name_str in self.op.get_value('unshare_weights'): + if name_str: + name = lay.WeightLayerParser.get_layer_name(name_str) + if name is not None: + name, idx = name[0], name[1] + if name not in self.model_state['layers']: + raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name) + layer = self.model_state['layers'][name] + lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx) + else: + raise ModelStateException("Invalid layer name '%s'; unable to unshare." 
% name_str) + + def set_driver(self): + if self.op.get_value('check_grads'): + self.driver = GradCheckDriver(self) + elif self.op.get_value('multiview_test'): + self.driver = MultiviewTestDriver(self) + elif self.op.get_value('write_features'): + self.driver = FeatureWriterDriver(self) + else: + self.driver = TrainingDriver(self) + + def fill_excused_options(self): + if self.op.get_value('check_grads'): + self.op.set_value('save_path', '') + self.op.set_value('train_batch_range', '0') + self.op.set_value('test_batch_range', '0') + self.op.set_value('data_path', '') + + # Make sure the data provider returned data in proper format + def parse_batch_data(self, batch_data, train=True): + if max(d.dtype != n.single for d in batch_data[2]): + raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.") + return batch_data + + def start_batch(self, batch_data, train=True): + self.driver.on_start_batch(batch_data, train) + + def finish_batch(self): + ret = IGPUModel.finish_batch(self) + self.driver.on_finish_batch() + return ret + + def print_iteration(self): + print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()), + + def print_train_time(self, compute_time_py): + print "(%.3f sec)" % (compute_time_py) + + def print_costs(self, cost_outputs): + costs, num_cases = cost_outputs[0], cost_outputs[1] + children = set() + for errname in costs: + if sum(errname in self.layers[z]['children'] for z in costs) == 0: +# print self.layers[errname]['children'] + for child in set(self.layers[errname]['children']) & set(costs.keys()): + costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])] + children.add(child) + + filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases) + print "%s: " % errname, + if 'outputFilterFormatter' not in self.layers[errname]: + print ", ".join("%.6f" % v for v in filtered_costs), + else: + print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs), + if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]): + print "<- error nan or inf!" + sys.exit(1) + for c in children: + del costs[c] + + def print_train_results(self): + self.print_costs(self.train_outputs[-1]) + + def print_test_status(self): + pass + + def print_test_results(self): + print NL + "======================Test output======================" + self.print_costs(self.test_outputs[-1]) + if not self.test_only: + print NL + "----------------------Averages-------------------------" + self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):])) + print NL + "-------------------------------------------------------", + for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now. 
+ l = self.layers[name] + if 'weights' in l: + wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))] + print "" + print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales), + print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))), + print "" + + def conditional_save(self): + self.save_state() + + def aggregate_test_outputs(self, test_outputs): + test_outputs = cp.deepcopy(test_outputs) + num_cases = sum(t[1] for t in test_outputs) + for i in xrange(1 ,len(test_outputs)): + for k,v in test_outputs[i][0].items(): + for j in xrange(len(v)): + test_outputs[0][0][k][j] += test_outputs[i][0][k][j] + + return (test_outputs[0][0], num_cases) + + @classmethod + def get_options_parser(cls): + op = IGPUModel.get_options_parser() + op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128) + op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False) + op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file") + op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="") + op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range']) + op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0) + op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True) + op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[]) + op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[]) + op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0) + op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0) + op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test']) + op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="") + op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1) + + op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path']) + op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="") + + op.delete_option('max_test_err') + op.options["testing_freq"].default = 57 + op.options["num_epochs"].default = 50000 + op.options['dp_type'].default = None + + DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider) + DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider) + DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider) + + return op + +if __name__ == 
"__main__": +# nr.seed(6) + + op = ConvNet.get_options_parser() + + op, load_dic = IGPUModel.parse_options(op) + model = ConvNet(op, load_dic) + model.start() diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile b/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile new file mode 100644 index 0000000..3d16e28 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile @@ -0,0 +1,108 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . 
-name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/libcudaconv.so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) + ln -sf $(TARGET) . + +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh b/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh new file mode 100644 index 0000000..6a4bd95 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh @@ -0,0 +1,1164 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CONV_UTIL_CUH +#define CONV_UTIL_CUH + +#include "../../nvmatrix/include/nvmatrix.cuh" + +#include "caffe2/core/context_gpu.h" + +#ifndef MIN +#define MIN(a, b) ((a) > (b) ? (b) : (a)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX); +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum); + +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum, + float scaleTargets, + float scaleOutput); +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + float scaleTargets, + float scaleOutput); + +void convResponseNorm( + NVMatrix& images, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv); +void convResponseNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput); +void convContrastNorm( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv); +void convContrastNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& meanDiffs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput); + +void convGaussianBlur( + NVMatrix& images, + NVMatrix& filter, + NVMatrix& target, + bool horiz, + int numChannels, + float scaleTargets, + float scaleOutputs); +void convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput); +void convBedOfNailsUndo( + NVMatrix& actsGrad, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput); + +void convResizeBilinear( + NVMatrix& images, + NVMatrix& target, + int imgSize, + int tgtSize, + float scale); +void convRGBToYUV(NVMatrix& images, NVMatrix& target); +void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center); +void convCrop( + NVMatrix& imgs, + NVMatrix& target, + int imgSize, + int tgtSize, + int startY, + int startX); +void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm); +void convContrastNormCrossMap( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked); +void convResponseNormCrossMapUndo( + NVMatrix& outGrads, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked, + float scaleTargets, + float scaleOutput); +void convResponseNormCrossMap( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + bool blocked); +void convResponseNormCrossMap( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked); +void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize); + +void convCrossMapMaxPoolUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + const int imgSize, + const int startF, + const int 
poolSize, + const int stride, + const float scaleTargets, + const float scaleOutputs); + +cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor); + +template +class AvgPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + __device__ inline float getBaseValue() const { + return 0; + } + __device__ inline float output(const float a, const int regionSize) const { + return sum ? a : (a / regionSize); + } +}; + +class MaxPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return fmaxf(a, b); + } + __device__ inline float getBaseValue() const { + return -2e38; + } + __device__ inline float output(const float a, const int regionSize) const { + return a; + } +}; + +class MaxAbsPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return fabsf(a) > fabsf(b) ? a : b; + } + __device__ inline float getBaseValue() const { + return 0.0f; + } + __device__ inline float output(const float a, const int regionSize) const { + return a; + } +}; + +/* + * Block size B_YxB_X + * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, numOutputs, numImages) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + */ + +template < + class Agg, + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kLocalPool( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + Agg agg) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = DIVUP(numFilters, B_Y * filtersPerThread); + const int outputIdxX = blockIdx.x / numImgBlocks; + const int outputIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread; + const int myFilterIdx = (blockFilterIdx + threadIdx.y * filtersPerThread); + if (myFilterIdx >= numFilters) { + return; + } + + const int outputIdx = outputIdxY * outputsX + outputIdxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startImgPxX = startX + outputIdxX * strideX; + const int startImgPxY = startX + outputIdxY * strideX; + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += myFilterIdx * imgPixels * numImages + imgIdx; + target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = agg.getBaseValue(); + } + } + + const int loopStartY = MAX(0, startImgPxY); + const int loopStartX = MAX(0, startImgPxX); + const int loopEndY = MIN(imgSize, startImgPxY + subsX); + const int loopEndX = MIN(imgSize, startImgPxX + subsX); + const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX); + for (int y = loopStartY; y < loopEndY; y++) { + for (int x = loopStartX; x < loopEndX; x++) { + 
const int imgPx = y * imgSize + x; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = + agg(prod[f][i], + imgs[(f * imgPixels + imgPx) * numImages + i * B_X]); + } + } + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * numOutputs * numImages + i * B_X] = + agg.output(prod[f][i], regionSize); + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, output idx in batches of B_Y + * + * So each block does one pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines output idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numOutputs, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + */ +template +__global__ void kPoolCrossMap( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + Agg agg) { + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + // const int numOutputs = DIVUP(numFilters, stride); + const int numOutputBlocks = DIVUP(numOutputs, B_Y); + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numOutputBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y; + // const int filterIdx = outputIdx * stride; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + if (outputIdx < numOutputs) { + imgs += (pxIdx)*numImages + imgIdx; + target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = agg.getBaseValue(); + } + } + + const int myStartF = startF + outputIdx * stride; + const int loopStartF = max(0, myStartF); + const int loopEndF = min(numFilters, myStartF + poolSize); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]); + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = agg.output(prod[i], poolSize); + } + } + } +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * target: (numOutputs, imgPixels, numImages) + */ +template +void convPoolCrossMap( + NVMatrix& images, + NVMatrix& target, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + const int imgSize, + Pooler pooler) { + int numImages = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numFilters = images.getNumRows() / imgPixels; + assert(images.getNumRows() == numFilters * imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + // assert(numFilters % 4 == 0); + // assert(numImages % 128 == 0); + 
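For reference, the cross-map pooling computed by kPoolCrossMap above (and dispatched by this host wrapper) can be stated in a few lines of NumPy. The sketch below covers only the MaxPooler case and assumes the (numFilters, imgPixels, numImages) layout described in the kernel comments; the function name and the flattened input/output shapes are illustrative, not part of the patch:

```python
import numpy as np

def cross_map_max_pool(imgs, num_filters, img_pixels, start_f,
                       pool_size, num_outputs, stride):
    """NumPy reference for the MaxPooler case of convPoolCrossMap.

    `imgs` holds (numFilters, imgPixels, numImages) flattened to
    (numFilters*imgPixels, numImages).  Output o pools the filter slice
    [start_f + o*stride, start_f + o*stride + pool_size), clipped to the
    valid filter range, independently at every pixel and image.
    """
    num_images = imgs.shape[1]
    x = imgs.reshape(num_filters, img_pixels, num_images)
    out = np.empty((num_outputs, img_pixels, num_images), dtype=x.dtype)
    for o in range(num_outputs):
        lo = max(0, start_f + o * stride)
        hi = min(num_filters, start_f + o * stride + pool_size)
        out[o] = x[lo:hi].max(axis=0)
    return out.reshape(num_outputs * img_pixels, num_images)
```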
assert(stride <= poolSize); + assert(startF <= 0); + assert( + startF + (numOutputs - 1) * stride + poolSize >= + numFilters); // All filters must be covered + + cudaStream_t stream = NVMatrix::getDefaultStream(); + target.resize(imgPixels * numOutputs, numImages); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + + dim3 threads(32, 4); + dim3 blocks( + imgSize * DIVUP(numImages, threads.x * imgsPerThread), + imgSize * DIVUP(numOutputs, threads.y)); + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + + } else if (imgsPerThread == 1) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + } + } else { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + } else { + assert(false); + } + } + getLastCudaError("convPoolCrossMap: kernel execution failed"); +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does a 4x4 region for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, numOutputs, numImages) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + * + * To be used when the stride is 1 and the pooling region is fairly large. 
+ */ +template < + class Agg, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kLocalPool2( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int outputsX, + Agg agg) { + __shared__ float shImgs[filtersPerThread][B_X * imgsPerThread]; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockOutputX = 4 * (blockIdx.x / numImgBlocks); + const int blockOutputY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + // const int blockOutputIdx = blockOutputY * outputsX + blockOutputX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int myX = threadIdx.y % 4; + const int myY = threadIdx.y / 4; + + const int myOutputIdxY = blockOutputY + myY; + const int myOutputIdxX = blockOutputX + myX; + const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX; + + const int startImgPxX = startX + blockOutputX; + const int startImgPxY = startX + blockOutputY; + const int endImgPxX = startImgPxX + subsX; + const int endImgPxY = startImgPxY + subsX; + + const int myStartImgPxY = startImgPxY + myY; + const int myStartImgPxX = startImgPxX + myX; + const int myEndImgPxY = endImgPxY + myY; + const int myEndImgPxX = endImgPxX + myX; + + const int loopStartY = MAX(startImgPxY, 0); + const int loopStartX = MAX(startImgPxX, 0); + const int loopEndY = MIN(imgSize, endImgPxY + 3); + const int loopEndX = MIN(imgSize, endImgPxX + 3); + + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = agg.getBaseValue(); + } + } + int regionSize = 0; + for (int y = loopStartY; y < loopEndY; y++) { + const bool isInY = y >= myStartImgPxY && y < myEndImgPxY; + for (int x = loopStartX; x < loopEndX; x++) { + // Load a pixel + const int px = y * imgSize + x; +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shImgs[ly + loadY][lx + loadX] = + imgs[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Is this pixel in my region? 
+ if (isInY && x >= myStartImgPxX && x < myEndImgPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]); + } + } + } + ++regionSize; + } + __syncthreads(); + } + } + if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * numOutputs * numImages + i * B_X] = + agg.output(prod[f][i], regionSize); + } + } + } + } +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, outputs, numImages) + */ +template +void convLocalPool( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int subsX, + int startX, + int strideX, + int outputsX, + Pooler pooler) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + // assert(numFilters % 4 == 0); + // assert(numImages % 128 == 0); + cudaStream_t stream = NVMatrix::getDefaultStream(); + int outputs = outputsX * outputsX; + target.resize(numFilters * outputs, numImages); + + if (strideX == 1 && subsX >= 6 && outputsX > 1) { + // NOTE: this part has not been optimized for Kepler + int imgsPerThread = numImages % 128 == 0 ? 8 : 4; + int filtersPerThread = numFilters % 4 == 0 + ? 4 + : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1; + int bx = 8; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + assert(numFilters % filtersPerThread == 0); + dim3 threads(bx, 16); + dim3 blocks( + DIVUP(outputsX, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(outputsX, 4) * numFilters / filtersPerThread); + // printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: + // %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n", + // threads.y, threads.x, blocks.y, blocks.x, imgSize, + // numFilters, numImages, subsX, startX, outputsX); + if (imgsPerThread == 8) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 3) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + 
target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } + } else if (imgsPerThread == 4) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 3) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } + } + } else { + int filtersPerThread = numFilters % 16 == 0 ? 4 : 1; + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * outputsX, + DIVUP(numFilters, 4 * filtersPerThread) * outputsX); + if (imgsPerThread == 4) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } else if (imgsPerThread == 2) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } else { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } + } + getLastCudaError("convLocalPool: kernel execution failed"); +} + +#endif /* CONV_UTIL_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh b/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh 
new file mode 100644 index 0000000..dc92cb7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh @@ -0,0 +1,197 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COMMON_CUH +#define COMMON_CUH + +#include // helper functions CUDA error checking and initialization +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "conv_util.cuh" + +#include "caffe2/core/context_gpu.h" + +enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE }; + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + 
caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth); +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth, + float scaleTargets, + float scaleOutput); + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +#endif /* COMMON_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu new file mode 100644 index 0000000..61f60bd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu @@ -0,0 +1,5019 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../nvmatrix/include/nvmatrix_kernels.cuh" +#include "../include/conv_util.cuh" + +using namespace std; + +__device__ inline float square(const float a) { + return a * a; +} + +/* + * Horizontal reflection. + * imgs: (numColors, imgSize, imgSize, numCases) + * targets: (numColors, imgSize, imgSize, numCases) + * + * targets should be a different array from imgs. 
+ * + * Block size: (4, 32) + * blockIdx.y * 4 + threadIdx.y determines pixel + * blockIdx.x * 32 * imgsPerThread + threadIdx.x determines case batch + * + */ +template +__global__ void +kReflectH(float* imgs, float* targets, const int imgSize, const int numCases) { + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + const int imgPixels = imgSize * imgSize; + + if (pxIdx < imgPixels) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdxY = pxIdx / imgSize; + const int pxIdxX = pxIdx % imgSize; + + const int pxIdxXR = imgSize - 1 - pxIdxX; // reflected coordinate + const int pxIdxR = pxIdxY * imgSize + pxIdxXR; + + imgs += pxIdx * numCases + caseIdx; + targets += pxIdxR * numCases + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numCases) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { + targets[c * imgPixels * numCases + i * 32] = + imgs[c * imgPixels * numCases + i * 32]; + } + } + } + } +} +/* + * Horizontal reflection. + * imgs: (numColors, imgSize, imgSize, numCases) + * targets: (numColors, imgSize, imgSize, numCases) + */ +void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize) { + int numCases = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numColors = images.getNumRows() / imgPixels; + assert(numColors * imgPixels == images.getNumRows()); + assert(numColors > 0 && numColors <= 3); + + targets.resize(images); + int imgsPerThread = numCases % 128 == 0 ? 4 : numCases % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numCases % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks(DIVUP(numCases, imgsPerThread * 32), DIVUP(imgPixels, 4)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (checkCaseBounds) { + if (numColors == 1) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<1, 1, true>, cudaFuncCachePreferL1); + kReflectH<1, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<1, 2, true>, cudaFuncCachePreferL1); + kReflectH<1, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<1, 4, true>, cudaFuncCachePreferL1); + kReflectH<1, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 2) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<2, 1, true>, cudaFuncCachePreferL1); + kReflectH<2, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<2, 2, true>, cudaFuncCachePreferL1); + kReflectH<2, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<2, 4, true>, cudaFuncCachePreferL1); + kReflectH<2, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 3) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<3, 1, true>, cudaFuncCachePreferL1); + kReflectH<3, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<3, 2, true>, cudaFuncCachePreferL1); + kReflectH<3, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + 
cudaFuncSetCacheConfig(kReflectH<3, 4, true>, cudaFuncCachePreferL1); + kReflectH<3, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } + } else { + if (numColors == 1) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<1, 1, false>, cudaFuncCachePreferL1); + kReflectH<1, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<1, 2, false>, cudaFuncCachePreferL1); + kReflectH<1, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<1, 4, false>, cudaFuncCachePreferL1); + kReflectH<1, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 2) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<2, 1, false>, cudaFuncCachePreferL1); + kReflectH<2, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<2, 2, false>, cudaFuncCachePreferL1); + kReflectH<2, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<2, 4, false>, cudaFuncCachePreferL1); + kReflectH<2, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 3) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<3, 1, false>, cudaFuncCachePreferL1); + kReflectH<3, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<3, 2, false>, cudaFuncCachePreferL1); + kReflectH<3, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<3, 4, false>, cudaFuncCachePreferL1); + kReflectH<3, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } + } + getLastCudaError("kReflectH: kernel execution failed"); +} + +/* + * blockIdx.y determines module in batches of B_Y + * blockIdx.x determines filter in batches of B_X * filtersPerThread + * + * weights: (numModules, numColors, filterPixels, numFilters) + * Not fully coalesced if B_X < 32, so use cache. + */ +template +__global__ void kNormalizeLCWeights( + float* weights, + const uint numFilters, + const int numModules, + const uint weightsPerFilter, + const float norm) { + const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y; + const uint filterIdx = B_X * blockIdx.x + threadIdx.x; + + float prod[filtersPerThread]; +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] = 0; + } + if (moduleIdx < numModules) { + weights += moduleIdx * weightsPerFilter * numFilters + filterIdx; + for (uint p = 0; p < weightsPerFilter; ++p) { +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] += square(weights[p * numFilters + i * B_X]); + } + } + +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] = sqrtf(prod[i]); + prod[i] = prod[i] > norm ? 
__fdividef(norm, prod[i]) : 1.0f; + } + + for (uint p = 0; p < weightsPerFilter; ++p) { +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + weights[p * numFilters + i * B_X] *= prod[i]; + } + } + } +} + +/* + * weights: (numModules, numColors, filterPixels, numFilters) + */ +void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) { + int numFilters = weights.getNumCols(); + int weightsPerFilter = weights.getNumRows() / numModules; + assert(numModules * weightsPerFilter == weights.getNumRows()); + + assert(!weights.isTrans()); + assert(weights.isContiguous()); + assert(numFilters % 16 == 0); + + int bx = numFilters % 32 == 0 ? 32 : 16; + int by = bx == 32 ? 4 : 8; + + int filtersPerThread = + numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1; + dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by)); + dim3 threads(bx, by); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (filtersPerThread == 4) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 4><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else if (filtersPerThread == 2) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 2><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else { + if (numFilters % 32 == 0) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 1><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1); + kNormalizeLCWeights<8, 16, 1><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } + } +} + +/* + * Block size 4x32 + * blockIdx.x determines img idx in batches of 32*imgsPerThread + * blockIdx.y determines channel idx, pixel idx in batches of 4 + * + * threadIdx.x determins case idx + * threadIdx.y determines pixel idx + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + */ +template +__global__ void kCrop( + float* imgs, + float* target, + const uint numImages, + const int imgStride, + const uint imgSize, + const uint tgtSize, + const uint startY, + const uint startX) { + const uint imgPixels = imgSize * imgSize; + const uint tgtPixels = tgtSize * tgtSize; + const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4); + const uint tgtPixelIdx = 4 * (blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y; + const uint tgtPxY = tgtPixelIdx / tgtSize; + const uint tgtPxX = tgtPixelIdx % tgtSize; + const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX; + + if (tgtPixelIdx < tgtPixels) { + imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx; + target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx; + +#pragma unroll + for (uint i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) { + target[i * 32] = imgs[i * 32]; + } + } + } +} + +/* + * Block size 4x32 + * blockIdx.y determines pixel idx in batches of 4 + * blockIdx.x determines case idx in batches of 32*imgsPerThread + * threadIdx.y determines pixel idx + * threadIdx.x determines case idx + * + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + * + * 
Each thread produces (y,u,v) values for a particular (r,g,b) pixel + * + * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV): + * + * [Y] [ 0.2126 0.7152 0.0722 ][R] + * [U] = [-0.09991 -0.33609 0.436 ][G] + * [V] [ 0.615 -0.55861 -0.05639][B] + */ +template +__global__ void kRGBToYUV( + float* imgs, + float* target, + const int imgPixels, + const int numImages, + const int imgStride) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + + if (pxIdx < imgPixels) { + const int imgChannelStride = imgPixels * imgStride; + const int tgtChannelStride = imgPixels * numImages; + imgs += pxIdx * imgStride + caseIdx; + target += pxIdx * numImages + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numImages) { + const float R = imgs[0 * imgChannelStride + i * 32]; + const float G = imgs[1 * imgChannelStride + i * 32]; + const float B = imgs[2 * imgChannelStride + i * 32]; + target[0 * tgtChannelStride + i * 32] = + 0.2126f * R + 0.7152f * G + 0.0722f * B; // Y + target[1 * tgtChannelStride + i * 32] = + -0.09991f * R + -0.33609f * G + 0.436f * B; // U + target[2 * tgtChannelStride + i * 32] = + 0.615f * R + -0.55861f * G + -0.05639f * B; // V + } + } + } +} + +__device__ inline float labf(const float x) { + if (x > 0.0088564517f) { + return __powf(x, 0.3333f); + } + return 7.787037f * x + 0.13793103f; +} + +/* + * Block size 4x32 + * blockIdx.y determines pixel idx in batches of 4 + * blockIdx.x determines case idx in batches of 32*imgsPerThread + * threadIdx.y determines pixel idx + * threadIdx.x determines case idx + * + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + * + * This proceeds in two steps. + * + * - First, RGB values are linearly transformed to XYZ as per + * http://en.wikipedia.org/wiki/CIE_XYZ_color_space + * - Second, XYZ values are nonlinearly transformed to L*a*b* as per + * http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation + * + * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel + * + * The RGB --> XYZ transform is: + * + * [X] [0.49 0.31 0.2 ][R] + * [Y] = 5.6506753 * [0.17697 0.8124 0.01063 ][G] + * [Z] [0 0.01 0.99 ][B] + * + * NOTE: The input should be in the range 0-1. Don't do mean-subtraction + * beforehand. + * + * Then X_max, Y_max, Z_max = 5.6506753. + * + * The range of the L* values is [0, 100]. + * If the center flag is given, the range will be [-50, 50]. 
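+ *
+ * Illustrative sanity check (derived from the kernel below, not an extra
+ * requirement): for pure white, R = G = B = 1, the three linear combinations
+ * computed below each evaluate to 1, labf(1) = 1, so L* = 116 - 16 = 100 and
+ * a* = b* = 0 (L* becomes 50 when the center flag is set).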
+ * + */ +template +__global__ void kRGBToLAB( + float* imgs, + float* target, + const int imgPixels, + const int numImages, + const int imgStride) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + + if (pxIdx < imgPixels) { + const int imgChannelStride = imgPixels * imgStride; + const int tgtChannelStride = imgPixels * numImages; + imgs += pxIdx * imgStride + caseIdx; + target += pxIdx * numImages + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numImages) { + const float R = imgs[0 * imgChannelStride + i * 32]; + const float G = imgs[1 * imgChannelStride + i * 32]; + const float B = imgs[2 * imgChannelStride + i * 32]; + + const float X = (0.49f * R + 0.31f * G + 0.2f * B); + const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B); + const float Z = (0.01f * G + 0.99f * B); + + const float labX = labf(X); + const float labY = labf(Y); + const float labZ = labf(Z); + + target[0 * tgtChannelStride + i * 32] = + 116.0f * labY - 16.0f - (center ? 50.0f : 0); // L* + target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY); // a* + target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ); // b* + } + } + } +} + +/* + * Block size 16x32. + * Each block produces a 4x4 chunk of the output image. + * threadIdx.y determines pixel idx in 4x4 chunk. + * threadIdx.x determines case idx. + * blockIdx.x determines case idx in batches of 32*imgsPerThread. + * blockIdx.y determines 4x4 chunk idx, channel idx. + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + * + * imgSize = scale * tgtSize (roughly) + * + * This is a rather naive kernel that relies on cache for speed. But all it's + * doing is basic texture manipulation, which is very local in nature, so it + * should be ok. Also, it will in practice be a tiny fraction of the runtime of + * a large convnet. + * + * So that is my justification for being lazy here. + */ +template +__global__ void kResizeBilinear( + float* imgs, + float* target, + const int imgSize, + const int tgtSize, + const int numImages, + const int imgStride, + const float scale, + const float centerScale) { + const int numChunksX = DIVUP(tgtSize, 4); + const int numChunks = numChunksX * numChunksX; + const int channelIdx = blockIdx.y / numChunks; + const int chunkIdx = blockIdx.y % numChunks; + const int chunkIdxX = chunkIdx % numChunksX; + const int chunkIdxY = chunkIdx / numChunksX; + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int imgPixels = imgSize * imgSize; + const int tgtPixels = tgtSize * tgtSize; + + const int pxX = 4 * chunkIdxX + threadIdx.y % 4; + const int pxY = 4 * chunkIdxY + threadIdx.y / 4; + + if (pxY < tgtSize && pxX < tgtSize) { + const int pxIdx = pxY * tgtSize + pxX; + + imgs += channelIdx * imgPixels * imgStride + caseIdx; + target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx; + + // This will cause slight distortions at the edges when upsampling in some + // cases. But I think that's not a big deal. 
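+    // Illustrative restatement of the arithmetic below: with u the weight of
+    // the left source column and w the weight of the bottom source row,
+    //   c0  = u*val0 + (1-u)*val1   (top edge)
+    //   c1  = u*val2 + (1-u)*val3   (bottom edge)
+    //   out = (1-w)*c0 + w*c1
+    // which is exactly the fused form used in the loop further down.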
+ const float srcPxX = fmaxf( + 0.0f, + fminf( + __int2float_rn(imgSize) - 1.01f, + __int2float_rn(pxX) * scale + centerScale)); + const float srcPxY = fmaxf( + 0.0f, + fminf( + __int2float_rn(imgSize) - 1.01f, + __int2float_rn(pxY) * scale + centerScale)); + + const float u = floorf(srcPxX + 1) - srcPxX; + const float w = srcPxY - floorf(srcPxY); + + // Consider doing max(0, min(imgSize, x)) here + const int srcPx0 = + (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left + const int srcPx1 = srcPx0 + 1; // top-right + const int srcPx2 = srcPx0 + imgSize; // bottom-left + const int srcPx3 = srcPx2 + 1; // bottom-right + +#pragma unroll + for (int c = 0; c < imgsPerThread; ++c) { + if (!checkCaseBounds || caseIdx + c * 32 < numImages) { + const float val0 = imgs[srcPx0 * imgStride + c * 32]; + const float val1 = imgs[srcPx1 * imgStride + c * 32]; + const float val2 = imgs[srcPx2 * imgStride + c * 32]; + const float val3 = imgs[srcPx3 * imgStride + c * 32]; + + const float c0 = u * (val0 - val1) + val1; + const float c1 = u * (val2 - val3) + val3; + + target[32 * c] = w * (c1 - c0) + c0; + } + } + } +} + +/* + * Block size B_YxB_X. + * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx + * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * filter: (1, 2*radius + 1) + * target: (numChannels, imgPixels, numImages) + * + * target can be the same matrix as imgs. + * radius must be one of 3, 5, 7, 9. + * + * Tried imgsPerThread, slower. + */ +template +__global__ void kGaussianBlur( + float* imgs, + float* filter, + float* target, + const int imgSize, + const int numImages, + const int imgStride, + const int numChannels, + const bool horiz, + const float scaleTargets, + const float scaleOutputs) { + const int filterWidth = 2 * radius + 1; + __shared__ float shFilter[filterWidth - 1]; + + const int imgPixels = imgSize * imgSize; + const int ty = B_Y * blockIdx.y + threadIdx.y; + const int channelIdx = ty / imgSize; + const int rowIdx = ty % imgSize; + const int imgIdx = B_X * blockIdx.x + threadIdx.x; + + // const int tidx = B_Y * threadIdx.y + threadIdx.x; + if (horiz) { + imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride + + imgIdx; + target += channelIdx * imgPixels * numImages + + rowIdx * imgSize * numImages + imgIdx; + } else { + imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx; + target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx; + } + float outputs[filterWidth - 1]; +#pragma unroll + for (int r = 0; r < filterWidth - 1; r++) { + outputs[r] = 0; + } + if (threadIdx.x < filterWidth - 1) { + shFilter[threadIdx.x] = filter[threadIdx.x]; + } + __syncthreads(); + + if (imgIdx < numImages && channelIdx < numChannels) { +// This writes radius*2 = filterWidth - 1 values to outputs +#pragma unroll + for (int col = 0; col < radius; col++) { + float px = imgs[0]; +#pragma unroll + for (int r = 0; r < radius + 1 + col; r++) { + outputs[r] += px * shFilter[radius + col - r]; + } + imgs += horiz ? 
imgStride : imgStride * imgSize; + } + + // Unfortunately this has to be at this level of granularity + if (scaleTargets != 0) { + for (int col = radius; col < imgSize; col++) { // loop over img columns + float px = imgs[0]; + target[0] = scaleTargets * target[0] + + scaleOutputs * (outputs[0] + px * shFilter[0]); + +#pragma unroll + for (int r = 1; r < radius * 2; r++) { + outputs[r - 1] = outputs[r] + px * shFilter[r]; + } + outputs[filterWidth - 2] = px * shFilter[0]; + + imgs += horiz ? imgStride : imgStride * imgSize; + target += horiz ? numImages : numImages * imgSize; + } + +#pragma unroll + for (int r = 0; r < radius; r++) { + float* t = &target[0]; + t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r]; + target += horiz ? numImages : numImages * imgSize; + } + } else { + for (int col = radius; col < imgSize; col++) { // loop over img columns + float px = imgs[0]; + target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]); +#pragma unroll + for (int r = 1; r < radius * 2; r++) { + outputs[r - 1] = outputs[r] + px * shFilter[r]; + } + outputs[filterWidth - 2] = px * shFilter[0]; + + imgs += horiz ? imgStride : imgStride * imgSize; + target += horiz ? numImages : numImages * imgSize; + } + +#pragma unroll + for (int r = 0; r < radius; r++) { + target[0] = scaleOutputs * outputs[r]; + target += horiz ? numImages : numImages * imgSize; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numChannels, imgPixels, numImages) + * target: (numChannels, numOutputs, numImages) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + */ + +template < + int B_Y, + int B_X, + int imgsPerThread, + int chansPerThread, + bool checkCaseBounds> +__global__ void kBedOfNails( + float* imgs, + float* target, + const int imgSize, + const int numChannels, + const int numImages, + const int startX, + const int strideX, + const int outputsX, + const bool reverse, + const float scaleTargets, + const float scaleOutput) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numChanBlocks = DIVUP(numChannels, B_Y * chansPerThread); + const int outputIdxX = blockIdx.x / numImgBlocks; + const int outputIdxY = blockIdx.y / numChanBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockChanIdx = (blockIdx.y % numChanBlocks) * B_Y * chansPerThread; + const int myChanIdx = (blockChanIdx + threadIdx.y * chansPerThread); + if (myChanIdx >= numChannels) { + return; + } + // if (blockIdx.x != 0 || blockIdx.y != 0) { + // return; + // } + const int outputIdx = outputIdxY * outputsX + outputIdxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startImgPxX = startX + outputIdxX * strideX; + const int startImgPxY = startX + outputIdxY * strideX; + const int imgIdx = blockImgIdx + threadIdx.x; + const int imgPx = startImgPxY * imgSize + startImgPxX; + + imgs += myChanIdx * imgPixels * numImages + imgPx * numImages + imgIdx; + target += (myChanIdx * numOutputs + outputIdx) * numImages + imgIdx; + + if (scaleTargets != 0) { + if (!reverse) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if 
(!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + target[c * numOutputs * numImages + i * B_X] = + scaleTargets * target[c * numOutputs * numImages + i * B_X] + + scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + imgs[c * imgPixels * numImages + i * B_X] = + scaleTargets * imgs[c * imgPixels * numImages + i * B_X] + + scaleOutput * target[c * numOutputs * numImages + i * B_X]; + } + } + } + } + } else { + if (!reverse) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + target[c * numOutputs * numImages + i * B_X] = + scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + imgs[c * imgPixels * numImages + i * B_X] = + scaleOutput * target[c * numOutputs * numImages + i * B_X]; + } + } + } + } + } +} + +/* + * imgs: (numChannels, imgPixels, numImages) + * target: (numChannels, outputs, numImages) + */ +void _convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + bool reverse, + float scaleTargets, + float scaleOutput) { + int numImages = reverse ? target.getNumCols() : images.getNumCols(); + int imgPixels = imgSize * imgSize; + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + assert(target.isContiguous()); + assert(strideX > 1); + + int outputsX = DIVUP(imgSize, strideX); + int outputs = outputsX * outputsX; + if (reverse) { + assert(target.getNumRows() == numChannels * outputs); + } else { + assert(images.getNumRows() == numChannels * imgPixels); + } + + if (scaleTargets == 0) { + if (reverse) { + images.resize(numChannels * imgPixels, numImages); + images.apply(NVMatrixOps::Zero()); + } else { + target.resize(numChannels * outputs, numImages); + } + } else { + if (reverse) { + assert(images.getNumRows() == numChannels * outputs); + assert(images.getNumCols() == numImages); + } else { + assert(target.getNumRows() == numChannels * outputs); + assert(target.getNumCols() == numImages); + } + } + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + int chansPerThread = numChannels % 8 == 0 ? 
2 : 1; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * outputsX, + DIVUP(numChannels, 4 * chansPerThread) * outputsX); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } else { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1); 
+ kBedOfNails<4, 32, 1, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } +} + +void convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput) { + _convBedOfNails( + images, + target, + numChannels, + imgSize, + startX, + strideX, + false, + scaleTargets, + scaleOutput); +} + +void convBedOfNailsUndo( + NVMatrix& actsGrad, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput) { + _convBedOfNails( + target, + actsGrad, + numChannels, + imgSize, + startX, + strideX, + true, + scaleTargets, + scaleOutput); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * filter: (1, 2*radius + 1) + * target: (numChannels, imgPixels, numImages) + */ +void convGaussianBlur( + NVMatrix& images, + NVMatrix& filter, + NVMatrix& target, + bool horiz, + int numChannels, + float scaleTargets, + float scaleOutputs) { + int numImages = images.getNumCols(); + int radius = filter.getNumCols() / 2; + int imgPixels = images.getNumRows() / numChannels; + int imgSize = int(sqrt(imgPixels)); + + assert(imgPixels == imgSize * imgSize); + assert(radius >= 1 && radius <= 4); + assert(imgSize >= 2 * radius + 1); + assert(filter.getNumRows() == 1); + assert(images.getNumRows() == numChannels * imgPixels); + assert(!images.isTrans()); + assert(!filter.isTrans()); + assert(!target.isTrans()); + assert(target.isContiguous()); + if (scaleTargets == 0) { + target.resize(images); + } else { + assert(target.isSameDims(images)); + } + + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, threads.x), DIVUP(numChannels * imgSize, threads.y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (radius == 1) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 1><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + + } else if (radius == 2) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 2><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + + } else if (radius == 3) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 3><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + } else if (radius == 4) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 4><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + } +} + +/* + * Block size 1x128 + * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread + * 
blockIdx.y determines pixel.y
+ *
+ * So each block does one output for some number of images and all the filters.
+ *
+ * threadIdx.x determines img idx
+ *
+ * imgs: (numFilters, imgPixels, numImages)
+ * meanDiffs: (numFilters, imgPixels, numImages)
+ * denoms: (numFilters, imgPixels, numImages) (out)
+ * target: (numFilters, imgPixels, numImages) (out)
+ *
+ * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
+ * numFilters must be divisible by B_Y*filtersPerThread
+ */
+
+template <int imgsPerThread, int numFilters, bool checkCaseBounds>
+__global__ void kCNorm_fewfilter(
+    float* imgs,
+    float* meanDiffs,
+    float* denoms,
+    float* target,
+    const int imgSize,
+    const int numImages,
+    const int sizeX,
+    const float addScale,
+    const float powScale,
+    const float minDiv) {
+  const int imgPixels = imgSize * imgSize;
+  const int numImgBlocks = DIVUP(numImages, 128 * imgsPerThread);
+  const int pxIdxX = blockIdx.x / numImgBlocks;
+  const int pxIdxY = blockIdx.y;
+  const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread;
+
+  const int pxIdx = pxIdxY * imgSize + pxIdxX;
+
+  const int startPxX = -sizeX / 2 + pxIdxX;
+  const int startPxY = -sizeX / 2 + pxIdxY;
+  const int imgIdx = blockImgIdx + threadIdx.x;
+
+  imgs += pxIdx * numImages + imgIdx;
+  denoms += pxIdx * numImages + imgIdx;
+  meanDiffs += imgIdx;
+  target += pxIdx * numImages + imgIdx;
+
+  float prod[numFilters][imgsPerThread];
+#pragma unroll
+  for (int i = 0; i < imgsPerThread; i++) {
+    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+      for (int f = 0; f < numFilters; f++) {
+        prod[f][i] = 0;
+      }
+    }
+  }
+  const int loopStartY = MAX(0, startPxY);
+  const int loopStartX = MAX(0, startPxX);
+  const int loopEndY = MIN(imgSize, startPxY + sizeX);
+  const int loopEndX = MIN(imgSize, startPxX + sizeX);
+
+  for (int y = loopStartY; y < loopEndY; y++) {
+    for (int x = loopStartX; x < loopEndX; x++) {
+      const int imgPx = y * imgSize + x;
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+          for (int f = 0; f < numFilters; f++) {
+            prod[f][i] += square(
+                meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]);
+          }
+        }
+      }
+    }
+  }
+
+#pragma unroll
+  for (int i = 0; i < imgsPerThread; i++) {
+    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+      for (int f = 0; f < numFilters; f++) {
+        prod[f][i] = minDiv + addScale * prod[f][i];
+        denoms[f * imgPixels * numImages + i * 128] = prod[f][i];
+        target[f * imgPixels * numImages + i * 128] =
+            imgs[f * imgPixels * numImages + i * 128] *
+            __powf(prod[f][i], -powScale);
+      }
+    }
+  }
+}
+
+/*
+ * Block size B_YxB_X
+ * blockIdx.x determines image idx in batches of B_X*imgsPerThread
+ * blockIdx.y determines filter idx in batches of B_Y*filtersPerThread
+ * blockIdx.z determines pixel
+ *
+ * So each block does one pixel for some number of images/filters.
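+ *
+ * Roughly, as a restatement of what the code computes (not additional
+ * requirements): for each (filter f, pixel p, image i),
+ *   denoms[f,p,i] = minDiv + addScale * sum of meanDiffs[f,.,i]^2 over the
+ *                   sizeX x sizeX spatial neighborhood of p
+ *   target[f,p,i] = imgs[f,p,i] * denoms[f,p,i]^(-powScale)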
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * means: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by B_Y*filtersPerThread + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kCNorm_manyfilter( + float* imgs, + float* meanDiffs, + float* denoms, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float addScale, + const float powScale, + const float minDiv) { + const int imgPixels = imgSize * imgSize; + + const int pxIdxX = blockIdx.z % imgSize; + const int pxIdxY = blockIdx.z / imgSize; + const int blockImgIdx = blockIdx.x * B_X * imgsPerThread; + const int blockFilterIdx = blockIdx.y * B_Y * filtersPerThread; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + + const int startPxX = -sizeX / 2 + pxIdxX; + const int startPxY = -sizeX / 2 + pxIdxY; + const int imgIdx = blockImgIdx + threadIdx.x; + imgs += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx; + denoms += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + target += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = 0; + } + } + } + + const int loopStartY = max(0, startPxY); + const int loopStartX = max(0, startPxX); + const int loopEndY = min(imgSize, startPxY + sizeX); + const int loopEndX = min(imgSize, startPxX + sizeX); + + for (int y = loopStartY; y < loopEndY; y++) { + for (int x = loopStartX; x < loopEndX; x++) { + const int imgPx = y * imgSize + x; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[f][i] += square( + meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]); + } + } + } + } + } +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[f][i] = minDiv + addScale * prod[f][i]; + denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + target[f * B_Y * imgPixels * numImages + i * B_X] = + imgs[f * B_Y * imgPixels * numImages + i * B_X] * + __powf(prod[f][i], -powScale); + } + } + } +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does 4x4 region of pixels for some number of images/filters. 
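+ *
+ * In outline (descriptive only): for each pixel of the block's input window,
+ * the threads first cooperatively stage meanDiffs for filtersPerThread
+ * filters and B_X*imgsPerThread images into shDiffs, and then each thread
+ * accumulates the squares that fall inside its own sizeX x sizeX
+ * neighborhood before the final denoms/target write-out.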
+ * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * imgs: (numFilters, imgPixels, numImages) + * means: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + */ +template < + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kCNorm2( + float* imgs, + float* meanDiffs, + float* denoms, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float addScale, + const float powScale, + const float minDiv) { + __shared__ float shDiffs[filtersPerThread][B_X * imgsPerThread]; + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockPxX = 4 * (blockIdx.x / numImgBlocks); + const int blockPxY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int startPxX = MAX(0, -sizeX / 2 + blockPxX); + const int startPxY = MAX(0, -sizeX / 2 + blockPxY); + const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3); + const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3); + + const int myPxX = blockPxX + threadIdx.y % 4; + const int myPxY = blockPxY + threadIdx.y / 4; + const int myPxIdx = myPxY * imgSize + myPxX; + // const bool doWork = myPxX < imgSize && myPxY < imgSize; + const int myStartPxY = -sizeX / 2 + myPxY; + const int myStartPxX = -sizeX / 2 + myPxX; + const int myEndPxY = myPxY + DIVUP(sizeX, 2); + const int myEndPxX = myPxX + DIVUP(sizeX, 2); + + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + meanDiffs += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = 0; + } + } + } + + for (int y = startPxY; y < endPxY; y++) { + const bool isInY = y >= myStartPxY && y < myEndPxY; + for (int x = startPxX; x < endPxX; x++) { + const int px = y * imgSize + x; +// All the threads load a pixel from memory +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shDiffs[ly + loadY][lx + loadX] = + 
meanDiffs[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Each row of threads decides if it's interested in this pixel + if (isInY && x >= myStartPxX && x < myEndPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]); + } + } + } + } + __syncthreads(); + } + } + // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX; + // imgs += threadIdx.x; + if (myPxX < imgSize && myPxY < imgSize) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = minDiv + addScale * prod[f][i]; + denoms[f * imgPixels * numImages + i * B_X] = prod[f][i]; + target[f * imgPixels * numImages + i * B_X] = + imgs[f * imgPixels * numImages + i * B_X] * + __powf(prod[f][i], -powScale); + } + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * meanDiffs: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by B_Y + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool checkCaseBounds, + bool blocked> +__global__ void kFCNorm( + cudaTextureObject_t imgs, + cudaTextureObject_t meanDiffs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float addScale, + const float powScale, + const float minDiv) { + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + + const int imgIdx = blockImgIdx + threadIdx.x; + const int imgOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + const int meanDiffsOffset = pxIdx * numImages + imgIdx; + // imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; + // meanDiffs += pxIdx * numImages + imgIdx; + target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = 0; + } + } + + const int startF = + blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx; + const int loopStartF = blocked ? 
startF : MAX(0, startF); + const int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] += square(tex1Dfetch( + meanDiffs, meanDiffsOffset + f * imgPixels * numImages + i * B_X)); + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = minDiv + addScale * prod[i]; + target[i * B_X] = tex1Dfetch(imgs, imgOffset + i * B_X) * + __powf(prod[i], -powScale); + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numOutputs, imgPixels, numImages) + * maxActs: (numOutputs, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this isn't really ideal + */ +template +__global__ void kCrossMapMaxPoolUndo( + float* imgs, + float* maxGrads, + float* maxActs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + // const int numOutputs = DIVUP(numFilters, stride); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + maxGrads += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; + maxActs += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; + target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; + // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) { + // return; + // } +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + } + + if (filterIdx < numFilters) { + // const int startOut = max(0, (filterIdx-startF-poolSize)/ stride + + // 1); + const int loopStartOut = + max(0, (filterIdx - startF - poolSize) / stride + 1); + const int loopEndOut = min(numOutputs, (filterIdx - startF) / stride + 1); + + for (int o = loopStartOut; o < loopEndOut; ++o) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float ma = maxActs[o * imgPixels * numImages + i * B_X]; + const float mg = maxGrads[o * imgPixels * numImages + i * B_X]; + const float img = imgs[i * B_X]; + prod[i] += (img == ma) * mg; + } + } + } + // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF); + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = prod[i]; + } + } + } else { +#pragma 
unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } + } +} + +/* + * images: (numFilters, imgPixels, numImages) + * maxGrads: (numOutputs, imgPixels, numImages) + * maxActs: (numOutputs, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convCrossMapMaxPoolUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + const int imgSize, + const int startF, + const int poolSize, + const int stride, + const float scaleTargets, + const float scaleOutputs) { + int numImages = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numFilters = images.getNumRows() / imgPixels; + int numOutputs = maxActs.getNumRows() / imgPixels; + assert(images.getNumRows() == numFilters * imgPixels); + assert(maxGrads.getNumRows() == numOutputs * imgPixels); + assert(maxGrads.getNumCols() == numImages); + assert(maxGrads.isSameDims(maxActs)); + + assert(images.getNumRows() == numFilters * imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(!maxGrads.isTrans()); + assert(!maxActs.isTrans()); + assert(images.isContiguous()); + assert(maxGrads.isContiguous()); + assert(maxActs.isContiguous()); + assert(maxGrads.isSameDims(maxActs)); + // assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(stride <= poolSize); + assert(startF <= 0); + assert( + startF + (numOutputs - 1) * stride + poolSize >= + numFilters); // All filters must be covered + + dim3 threads(32, 4); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + dim3 blocks( + imgSize * DIVUP(numImages, threads.x * imgsPerThread), + imgSize * DIVUP(numFilters, threads.y)); + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (scaleTargets == 0) { + target.resize(images); + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + kCrossMapMaxPoolUndo<4, 32, 4, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else if (imgsPerThread == 2) { + kCrossMapMaxPoolUndo<4, 32, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else { + kCrossMapMaxPoolUndo<4, 32, 1, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + kCrossMapMaxPoolUndo<4, 32, 1, false, true> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + assert(target.isSameDims(images)); + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + kCrossMapMaxPoolUndo<4, 32, 4, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); 
+ } else if (imgsPerThread == 2) { + kCrossMapMaxPoolUndo<4, 32, 2, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else { + kCrossMapMaxPoolUndo<4, 32, 1, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + kCrossMapMaxPoolUndo<4, 32, 1, true, true> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } + getLastCudaError("convCrossMapMaxPoolUndo: kernel execution failed"); +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this isn't really ideal + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool add, + bool checkCaseBounds, + bool blocked> +__global__ void kFRNormUndo( + cudaTextureObject_t outGrads, + cudaTextureObject_t denoms, + cudaTextureObject_t inputs, + cudaTextureObject_t acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + const int actsOffset = pxIdx * numImages + imgIdx; + const int inputOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + target += inputOffset; + float prod[imgsPerThread]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + } + + const int startF = blocked ? (filterIdx / sizeF) * sizeF + : -sizeF + sizeF / 2 + 1 + filterIdx; + const int loopStartF = blocked ? 
startF : MAX(0, startF); + const int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] += tex1Dfetch( + acts, actsOffset + f * imgPixels * numImages + i * B_X); + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); + const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); + const float den = tex1Dfetch(denoms, inputOffset + i * B_X); + prod[i] = inp * prod[i] + out * __powf(den, -powScale); + target[i * B_X] = prod[i]; + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); + const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); + const float den = tex1Dfetch(denoms, inputOffset + i * B_X); + prod[i] = inp * prod[i] + out * __powf(den, -powScale); + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this is pretty wasteful of computation. a lot of threads basically + * compute the same products. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool add, + bool checkCaseBounds, + bool blocked> +//__launch_bounds__(128,16) +__global__ void kFRNormUndo2( + cudaTextureObject_t outGrads, + cudaTextureObject_t inputs, + cudaTextureObject_t acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float addScale, + const float powScale, + const float minDiv, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + const int inpOffset = pxIdx * numImages + imgIdx; + const int outOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + target += outOffset; + + float prod[imgsPerThread]; + float denoms[imgsPerThread]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + denoms[i] = 0; + } + + int startF = blocked ? (filterIdx / sizeF) * sizeF + : -sizeF + sizeF / 2 + 1 + filterIdx; + int loopStartF = blocked ? 
startF : MAX(0, startF); + int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + // If an input is zero, then we shuldn't divide by it. + const float grad = tex1Dfetch( + outGrads, inpOffset + f * imgPixels * numImages + i * B_X); + const float act = tex1Dfetch( + acts, inpOffset + f * imgPixels * numImages + i * B_X); + const float inp = + tex1Dfetch( + inputs, inpOffset + f * imgPixels * numImages + i * B_X) + + (act == 0); + prod[i] += grad * act * __powf(__fdividef(act, inp), 1.0f / powScale); + } + } + } + + startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx; + loopStartF = blocked ? startF : MAX(0, startF); + loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + denoms[i] += square(tex1Dfetch( + inputs, inpOffset + f * imgPixels * numImages + i * B_X)); + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, outOffset + i * B_X); + const float out = tex1Dfetch(outGrads, outOffset + i * B_X); + denoms[i] = addScale * denoms[i] + minDiv; + prod[i] = + (-2 * powScale * addScale * inp * prod[i] + + out * __powf(denoms[i], -powScale)); + target[i * B_X] = prod[i]; + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, outOffset + i * B_X); + const float out = tex1Dfetch(outGrads, outOffset + i * B_X); + denoms[i] = addScale * denoms[i] + minDiv; + prod[i] = + (-2 * powScale * addScale * inp * prod[i] + + out * __powf(denoms[i], -powScale)); + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output pixel for some number of images/filters. 
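+ *
+ * Informally (mirroring the loops below): each image pixel accumulates, over
+ * every pooling window that covers it, avgGrads[window] divided by that
+ * window's (border-clipped) area, or the unscaled gradient when the sum
+ * template flag is set; the result is then written to target, blended with
+ * scaleTargets/scaleOutputs when add is set.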
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * rMaxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + */ + +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool sum, + bool add, + bool checkCaseBounds> +__global__ void kLocalAvgUndo( + float* avgGrads, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread)); + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y * + filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startOutputY = + blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; + const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); + const int startOutputX = + blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX; + const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX); + + const int imgIdx = blockImgIdx + threadIdx.x; + + avgGrads += + ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx; + target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + if (blockPxX >= startX && + blockPxX < startX + strideX * (outputsX - 1) + subsX && + blockPxY >= startX && + blockPxY < startX + strideX * (outputsX - 1) + subsX) { + for (int my = startOutputY; my < endOutputY; my++) { + const float regionStartY = fmaxf(0, startX + my * strideX); + const float regionEndY = fminf(imgSize, startX + my * strideX + subsX); + const float regionSizeY = regionEndY - regionStartY; + for (int mx = startOutputX; mx < endOutputX; mx++) { + const int outputIdx = my * outputsX + mx; + const float regionStartX = fmaxf(0, startX + mx * strideX); + const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX); + const float regionSizeX = regionEndX - regionStartX; + // It's important to do the division here, because pushing division into + // the below loops makes the code 4x slower. + const float regionSizeInv = + sum ? 
1.0f : (1.0f / (regionSizeX * regionSizeY)); +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += + avgGrads + [(f * B_Y * numOutputs + outputIdx) * numImages + + i * B_X] * + regionSizeInv; + } + } + } + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * B_Y * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * maxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool add, + bool checkCaseBounds> +__global__ void kLocalMaxUndo( + float* imgs, + float* maxGrads, + float* maxActs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImgs[B_Y * filtersPerThread][B_X * imgsPerThread]; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread)); + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y * + filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startOutputY = + blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; + const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); + const int startOutputX = + blockPxX - startX < subsX ? 
0 : 1 + (blockPxX - startX - subsX) / strideX;
+  const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
+
+  const int imgIdx = blockImgIdx + threadIdx.x;
+
+  imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
+      imgIdx;
+  maxGrads +=
+      ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
+  maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
+
+  target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
+      imgIdx;
+
+  float prod[filtersPerThread][imgsPerThread];
+#pragma unroll
+  for (int f = 0; f < filtersPerThread; f++) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      prod[f][i] = 0;
+    }
+  }
+
+  if (blockPxX >= startX &&
+      blockPxX < startX + strideX * (outputsX - 1) + subsX &&
+      blockPxY >= startX &&
+      blockPxY < startX + strideX * (outputsX - 1) + subsX) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] =
+              imgs[f * B_Y * imgPixels * numImages + i * B_X];
+        }
+      }
+    }
+    for (int my = startOutputY; my < endOutputY; my++) {
+      for (int mx = startOutputX; mx < endOutputX; mx++) {
+        const int outputIdx = my * outputsX + mx;
+#pragma unroll
+        for (int i = 0; i < imgsPerThread; i++) {
+          if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+            for (int f = 0; f < filtersPerThread; f++) {
+              const float ma = maxActs
+                  [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+              const float mg = maxGrads
+                  [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+              const float img =
+                  shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i];
+
+              prod[f][i] += (img == ma) * mg;
+            }
+          }
+        }
+      }
+    }
+  }
+  if (!add) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
+        }
+      }
+    }
+  } else {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          target[f * B_Y * imgPixels * numImages + i * B_X] =
+              scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] +
+              scaleOutputs * prod[f][i];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * acts := -2 x scale x acts x outGrads / denoms
+ */
+template <int B_X, int eltsPerThread>
+__global__ void kRNormUndoPrelims(
+    float* acts,
+    cudaTextureObject_t denoms,
+    cudaTextureObject_t outGrads,
+    const uint numElements,
+    const float scale) {
+  const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x;
+  const uint numThreads = B_X * gridDim.x;
+  for (uint i = e; i < numElements; i += numThreads * eltsPerThread) {
+#pragma unroll
+    for (uint k = 0; k < eltsPerThread; k++) {
+      if (i + k * B_X < numElements) {
+        acts[i + k * B_X] = __fdividef(
+            scale * tex1Dfetch<float>(outGrads, i + k * B_X) *
+                acts[i + k * B_X],
+            tex1Dfetch<float>(denoms, i + k * B_X));
+      }
+    }
+  }
+}
+
+/*
+ * Block size B_YxB_X
+ * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
+ * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
+ *
+ * So each block does one output pixel for some number of images/filters.
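+ *
+ * Added note (not part of the original kernel sources): kRNormUndoPrelims
+ * above overwrites acts in place with
+ *   acts := -2 * addScale * powScale * outGrads * acts / denoms,
+ * and with the forward-pass convention acts = inputs * denoms^(-powScale),
+ * each rewritten entry is the "cross" term of the response-norm gradient.
+ * This kernel then only has to sum those terms over the local window and add
+ * the direct term, which is what
+ *   prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale)
+ * computes in the write-out loops below.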
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + * + * TODO: this isn't really ideal + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kRNormUndo( + float* outGrads, + float* denoms, + float* inputs, + float* acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (B_Y * filtersPerThread); + + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / numFilterBlocks; + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int imgPixels = imgSize * imgSize; + + const int startY = MAX(0, blockPxY + sizeX / 2 - sizeX + 1); + const int startX = MAX(0, blockPxX + sizeX / 2 - sizeX + 1); + const int endY = MIN(imgSize, blockPxY + sizeX / 2 + 1); + const int endX = MIN(imgSize, blockPxX + sizeX / 2 + 1); + + const int imgIdx = blockImgIdx + threadIdx.x; + + acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx; + inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + outGrads += + ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + for (int sy = startY; sy < endY; sy++) { + for (int sx = startX; sx < endX; sx++) { + const int outPx = sy * imgSize + sx; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += + acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X]; + } + } + } + } + } + // outGrads += blockPx * numImages; + if (scaleTargets == 0) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float out = + outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float inp = inputs[(f * B_Y * 
imgPixels) * numImages + i * B_X]; + const float out = + outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * B_Y * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does 4x4 region for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + */ +template < + int B_X, + int imgsPerThread, + int filtersPerThread, + bool add, + bool checkCaseBounds> +__global__ void kRNormUndo2( + float* outGrads, + float* denoms, + float* inputs, + float* acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shActs[filtersPerThread][B_X * imgsPerThread]; + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockPxX = 4 * (blockIdx.x / numImgBlocks); + const int blockPxY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int startPxX = MAX(0, -DIVUP(sizeX, 2) + blockPxX + 1); + const int startPxY = MAX(0, -DIVUP(sizeX, 2) + blockPxY + 1); + const int endPxX = MIN(imgSize, blockPxX + sizeX / 2 + 4); + const int endPxY = MIN(imgSize, blockPxY + sizeX / 2 + 4); + + const int myPxX = blockPxX + threadIdx.y % 4; + const int myPxY = blockPxY + threadIdx.y / 4; + const int myPxIdx = myPxY * imgSize + myPxX; + // const bool doWork = myPxX < imgSize && myPxY < imgSize; + const int myStartPxY = -DIVUP(sizeX, 2) + myPxY + 1; + const int myStartPxX = -DIVUP(sizeX, 2) + myPxX + 1; + const int myEndPxY = myPxY + sizeX / 2 + 1; + const int myEndPxX = myPxX + sizeX / 2 + 1; + + const int imgIdx = blockImgIdx + threadIdx.x; + + acts += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + target += 
(blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + for (int y = startPxY; y < endPxY; y++) { + const bool isInY = y >= myStartPxY && y < myEndPxY; + for (int x = startPxX; x < endPxX; x++) { + const int px = y * imgSize + x; +// All the threads load a pixel from memory +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shActs[ly + loadY][lx + loadX] = + acts[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Each row of threads decides if it's interested in this pixel + if (isInY && x >= myStartPxX && x < myEndPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += shActs[f][threadIdx.x + i * B_X]; + } + } + } + } + __syncthreads(); + } + } + acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX; + acts += threadIdx.x; + if (myPxX < imgSize && myPxY < imgSize) { + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float out = outGrads[f * imgPixels * numImages + i * B_X]; + const float den = denoms[f * imgPixels * numImages + i * B_X]; + const float inp = inputs[f * imgPixels * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float out = outGrads[f * imgPixels * numImages + i * B_X]; + const float den = denoms[f * imgPixels * numImages + i * B_X]; + const float inp = inputs[f * imgPixels * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } + } +} + +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX) { + convLocalMaxUndo( + images, + maxGrads, + maxActs, + target, + subsX, + startX, + strideX, + outputsX, + 0, + 1); +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * rMaxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + float scaleTargets, + float scaleOutput) { + int outputs = outputsX * outputsX; + int numImages = images.getNumCols(); + int numFilters = maxGrads.getNumRows() / outputs; + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + + assert(imgSize * 
imgSize == imgPixels); + assert(maxGrads.getNumRows() == numFilters * outputs); + assert(maxGrads.getNumCols() == numImages); + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(!maxGrads.isTrans()); + assert(!maxActs.isTrans()); + assert(images.isContiguous()); + assert(maxGrads.isContiguous()); + assert(maxActs.isContiguous()); + assert(maxGrads.isSameDims(maxActs)); + assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(strideX <= subsX); + + target.resize(images); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + int checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 2)) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 4, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 4, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 4, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 4, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 2, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 2, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 2, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 2, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 1, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + 
numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 1, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 1, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 1, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + + getLastCudaError("convLocalMaxUndo: kernel execution failed"); +} + +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum) { + convLocalAvgUndo( + avgGrads, target, subsX, startX, strideX, outputsX, imgSize, sum, 0, 1); +} + +/* + * avgGrads: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum, + float scaleTargets, + float scaleOutput) { + int numImages = avgGrads.getNumCols(); + + int outputs = outputsX * outputsX; + int imgPixels = imgSize * imgSize; + int numFilters = avgGrads.getNumRows() / outputs; + assert(avgGrads.getNumRows() == numFilters * outputs); + + assert(!target.isTrans()); + assert(!avgGrads.isTrans()); + assert(avgGrads.isContiguous()); + assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(strideX <= subsX); + + target.resize(numFilters * imgPixels, numImages); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + int checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 4)) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + bool scale = !(scaleTargets == 0 && scaleOutput == 1); + if (sum) { + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + } else { + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if 
(scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + } + + getLastCudaError("convLocalAvgUndo: kernel execution failed"); +} + +void convResponseNorm( + NVMatrix& images, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv) { + convContrastNorm( + images, + images, + denoms, + target, + numFilters, + sizeX, + addScale, + powScale, + minDiv); +} + +/* + * images: (numFilters, imgPixels, numImages) + * meanDiffs: 
(numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + */ +void convContrastNorm( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + assert(meanDiffs.isSameDims(images)); + + assert(!meanDiffs.isTrans()); + assert(!images.isTrans()); + assert(images.isContiguous()); + assert(meanDiffs.isContiguous()); + assert(numFilters % 16 == 0 || numFilters <= 8); + + target.resize(images); + denoms.resize(images); + assert(target.isContiguous()); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (sizeX >= 6 && numFilters % 4 == 0) { + // This one is faster for large regions (my tests show regions >= 6...) + int imgsPerThread = 8; + int filtersPerThread = 4; + int bx = 8; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + assert(numFilters % filtersPerThread == 0); + dim3 threads(bx, 16); + dim3 blocks( + DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(imgSize, 4) * numFilters / filtersPerThread); + + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here + kCNorm2<8, 8, 4, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here + kCNorm2<8, 8, 4, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else { + bool checkCaseBounds = numImages % 128 != 0; + if (numFilters <= 8) { + dim3 threads(128); + dim3 blocks(DIVUP(numImages, 128) * imgSize, imgSize); + if (numFilters == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 1, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 1, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 2, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 2, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 3) { + if (checkCaseBounds) { + 
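// Added note: the second template argument of kCNorm_fewfilter tracks numFilters, so each small filter count (1-8) gets its own instantiation with compile-time loop bounds. +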
cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 3, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 3, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 4, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 4, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 5) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 5, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 5, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 6) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 6, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 6, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 7) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 7, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 7, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 8) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 8, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1); + 
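// Added note: cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1) asks the CUDA runtime to favor a larger L1 cache over shared memory when this kernel is next launched; it is a preference the driver may ignore. +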
kCNorm_fewfilter<1, 8, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } + } else { + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, threads.x * 4), + (numFilters / (threads.y * 2)), + imgPixels); + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kCNorm_manyfilter<4, 32, 4, 2, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kCNorm_manyfilter<4, 32, 4, 2, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } + } + getLastCudaError("convResponseNorm: kernel execution failed"); +} + +void convContrastNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& meanDiffs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput) { + convResponseNormUndo( + outGrads, + denoms, + meanDiffs, + acts, + target, + numFilters, + sizeX, + addScale, + powScale, + scaleTargets, + scaleOutput); +} + +/* + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * THIS WILL OVERWRITE THE ACTS MATRIX. + */ +void convResponseNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput) { + int numImages = outGrads.getNumCols(); + int imgPixels = outGrads.getNumRows() / numFilters; + + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + + assert(outGrads.getNumRows() == numFilters * imgPixels); + + assert(denoms.isSameDims(outGrads)); + assert(acts.isSameDims(denoms)); + assert(!denoms.isTrans()); + assert(!outGrads.isTrans()); + assert(!acts.isTrans()); + assert(!target.isTrans()); + assert(outGrads.isContiguous()); + + assert(numFilters % 16 == 0); + + target.resize(outGrads); + assert(target.isContiguous()); + // First do acts := -2 x scale x acts x outGrads / denoms + // so that the main routine only has to do an addition in its inner loop. + int prelimEltsPerThread = 8; + dim3 threads(128); + dim3 blocks( + DIVUP(outGrads.getNumElements(), (threads.x * prelimEltsPerThread))); + bool checkPrelimBounds = + outGrads.getNumElements() % (threads.x * prelimEltsPerThread) != 0; + // printf("num elts: %d, blocks: %d\n", outGrads.getNumElements(), blocks.x); + cudaStream_t stream = NVMatrix::getDefaultStream(); + kRNormUndoPrelims<128, 8><<>>( + acts.getDevData(), + denoms.getTextureObject(), + outGrads.getTextureObject(), + outGrads.getNumElements(), + -2 * addScale * powScale); + + // Now the main routine + if (sizeX >= 6 && numFilters % 4 == 0) { + // This one is faster for large regions (my tests show regions >= 6...) + // NOTE: this stuff is not optimized for Kepler. Only kRNormUndo is. + int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
4 : 2; + int filtersPerThread = 4; + int bx = 16; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + + threads = dim3(bx, 16); + blocks = dim3( + DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(imgSize, 4) * numFilters / filtersPerThread); + if (imgsPerThread == 8) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, true, true>, 
cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } + } else { + int imgsPerThread = numImages % 128 == 0 ? 4 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + threads = dim3(32, 4); + blocks = dim3( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 2)) * imgSize); + + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 4, 2, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 4, 2, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 1, 2, true>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 1, 2, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 1, 2, false>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 1, 2, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } + getLastCudaError("kRNormUndo: kernel execution failed"); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + * + * imgSize = scale * tgtSize + */ +void convResizeBilinear( + NVMatrix& images, + NVMatrix& target, + int imgSize, + int tgtSize, + float scale) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = imgSize * imgSize; + int tgtPixels = tgtSize * tgtSize; + int numChannels 
= images.getNumRows() / imgPixels; + int numImages = images.getNumCols(); + assert(images.getNumRows() == numChannels * imgPixels); + + target.resize(numChannels * tgtPixels, numImages); + assert(target.isContiguous()); + int numChunksX = DIVUP(tgtSize, 4); + int numChunks = numChunksX * numChunksX; + double imgCenter = imgSize * 0.5; + double tgtCenter = tgtSize * 0.5; + double centerScale = imgCenter - tgtCenter * scale; + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + cudaStream_t stream = NVMatrix::getDefaultStream(); + dim3 threads(32, 16); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1); + kResizeBilinear<4, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1); + kResizeBilinear<4, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1); + kResizeBilinear<2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1); + kResizeBilinear<2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1); + kResizeBilinear<1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1); + kResizeBilinear<1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } + getLastCudaError("convResizeBilinear: kernel execution failed"); +} + +/* + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + */ +void convRGBToYUV(NVMatrix& images, NVMatrix& target) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = images.getNumRows() / 3; + int numImages = images.getNumCols(); + assert(images.getNumRows() == 3 * imgPixels); + + target.resize(3 * imgPixels, numImages); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + cudaStream_t stream = NVMatrix::getDefaultStream(); + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1); + kRGBToYUV<4, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1); + kRGBToYUV<4, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1); + kRGBToYUV<2, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1); + kRGBToYUV<2, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1); + kRGBToYUV<1, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1); + kRGBToYUV<1, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + getLastCudaError("convRGBToYUV: kernel execution failed"); +} + +/* + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + */ +void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = images.getNumRows() / 3; + int numImages = images.getNumCols(); + assert(images.getNumRows() == 3 * imgPixels); + + target.resize(3 * imgPixels, numImages); + assert(target.isContiguous()); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<4, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<4, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<4, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<4, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<4, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<4, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<4, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } else if (imgsPerThread == 2) { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<2, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<2, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<2, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<2, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<2, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<2, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<2, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } else { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<1, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<1, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<1, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<1, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<1, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<1, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<1, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } + getLastCudaError("convRGBToLAB: kernel execution failed"); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + */ +void convCrop( + NVMatrix& imgs, + NVMatrix& target, + int imgSize, + int tgtSize, + int startY, + int startX) { + int numImages = imgs.getNumCols(); + int imgPixels = imgSize * imgSize; + int tgtPixels = tgtSize * tgtSize; + + int numChannels = 
imgs.getNumRows() / imgPixels; + assert(imgs.getNumRows() == imgPixels * numChannels); + assert(imgPixels == imgSize * imgSize); + assert(imgSize - startY >= tgtSize); + assert(imgSize - startX >= tgtSize); + assert(startY >= 0); + assert(startX >= 0); + target.resize(numChannels * tgtPixels, numImages); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4)); + dim3 threads(32, 4); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + kCrop<4, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<4, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + kCrop<2, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<2, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } else { + if (checkCaseBounds) { + kCrop<1, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<1, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } + getLastCudaError("convCrop: kernel execution failed"); +} + +/* + * images: (numFilters, imgPixels, numImages) + * meanDiffs: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + + * Note: at present, I have no code to compute the meanDiffs. So it should be + set + * to be equal to images. In other words, this isn't really doing contrast + normalization, + * just response normalization. 
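+ *
+ * Added usage sketch (argument values are illustrative only): passing the
+ * same matrix for images and meanDiffs therefore gives plain cross-map
+ * response normalization, e.g.
+ *
+ *   convContrastNormCrossMap(images, images, target, numFilters, sizeF,
+ *                            addScale, powScale, 1.0f, true);
+ *
+ * with minDiv = 1 and blocked normalization enabled, mirroring how
+ * convResponseNorm above forwards to convContrastNorm with images passed
+ * twice.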
+ */ +void convContrastNormCrossMap( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + assert(meanDiffs.isSameDims(images)); + assert(sizeF > 0 && sizeF <= numFilters); + + assert(!meanDiffs.isTrans()); + assert(!images.isTrans()); + assert(images.isContiguous()); + assert(meanDiffs.isContiguous()); + assert(numFilters % 16 == 0); + + target.resize(images); + // denoms.resize(images); + assert(target.isContiguous()); + + bool checkCaseBounds = numImages % 128 != 0; + + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + // printf("convContrastNormCrossMap imgs: %p, meanDiffs: %p, denoms: %p, + // target: %p, imgSize: %d, numFilters: %d, numImages: %d, sizeF: %d, + // addScale: %f, powScale: %f, minDiv: %f, blocked: %d\n", + // images.getDevData(), meanDiffs.getDevData(), + // denoms.getDevData(), target.getDevData(), imgSize, numFilters, + // numImages, sizeF, addScale, powScale, minDiv, blocked); + if (blocked) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, true, true><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, false, true><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, true, false>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, true, false><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, false, false><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } + } + + getLastCudaError("convContrastNormCrossMap: kernel execution failed"); +} + +/* + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * THIS WILL OVERWRITE THE ACTS MATRIX. 
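+ *
+ * Added note: this is the cross-map counterpart of convResponseNormUndo
+ * above; the accumulation runs over a window of sizeF neighbouring filters
+ * (contiguous blocks of sizeF filters when blocked is true) rather than over
+ * a sizeX x sizeX spatial window. Because acts is consumed as scratch space,
+ * callers that still need the forward-pass activations should copy them
+ * before calling this routine.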
+
+/*
+ * outGrads:   (numFilters, imgPixels, numImages)
+ * denoms:     (numFilters, imgPixels, numImages)
+ * inputs:     (numFilters, imgPixels, numImages)
+ * acts:       (numFilters, imgPixels, numImages)
+ * target:     (numFilters, imgPixels, numImages)
+ *
+ * THIS WILL OVERWRITE THE ACTS MATRIX.
+ */
+void convResponseNormCrossMapUndo(
+    NVMatrix& outGrads,
+    NVMatrix& inputs,
+    NVMatrix& acts,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    float minDiv,
+    bool blocked,
+    float scaleTargets,
+    float scaleOutput) {
+  int numImages = outGrads.getNumCols();
+  int imgPixels = outGrads.getNumRows() / numFilters;
+
+  int imgSize = int(sqrt(imgPixels));
+  assert(imgSize * imgSize == imgPixels);
+  assert(sizeF > 0 && sizeF <= numFilters);
+  assert(outGrads.getNumRows() == numFilters * imgPixels);
+
+  assert(!outGrads.isTrans());
+  assert(!acts.isTrans());
+  assert(!target.isTrans());
+  assert(outGrads.isContiguous());
+
+  assert(numFilters % 16 == 0);
+
+  target.resize(outGrads);
+  assert(target.isContiguous());
+  // First do acts := -2 x scale x acts x outGrads / denoms
+  // so that the main routine only has to do an addition in its inner loop.
+  cudaStream_t stream = NVMatrix::getDefaultStream();
+
+  dim3 threads2 = dim3(32, 4);
+  dim3 blocks2 =
+      dim3(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize);
+
+  bool checkCaseBounds = (numImages % 128) != 0;
+  if (blocked) {
+    if (scaleTargets == 0 && scaleOutput == 1) {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, true, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, true, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, false, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, false, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    } else {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, true, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, true, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, false, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, false, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    }
+  } else {
+    if (scaleTargets == 0 && scaleOutput == 1) {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, true, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, true, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, false, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, false, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    } else {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, true, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, true, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, false, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, false, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    }
+  }
+
+  getLastCudaError("convResponseNormCrossMapUndo: kernel execution failed");
+}
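+
+/*
+ * Sketch of the gradient being undone here (an assumption, inferred from the
+ * "acts := -2 x scale x acts x outGrads / denoms" note above rather than from
+ * the kernel source): with denom[f] = minDiv + addScale * sum_j inputs[j]^2
+ * and acts[f] = inputs[f] * pow(denom[f], -powScale), the input gradient is
+ * believed to combine a direct term, outGrads[f] * pow(denom[f], -powScale),
+ * with a cross-map term proportional to
+ *
+ *   -2 * addScale * powScale * inputs[f] * sum_j (outGrads[j] * acts[j] / denom[j])
+ *
+ * As in the other routines, the result is written back as
+ *   target := scaleTargets * target + scaleOutput * gradient,
+ * so scaleTargets = 0 with scaleOutput = 1 simply overwrites target.
+ */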
+
+void convResponseNormCrossMap(
+    NVMatrix& images,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    float minDiv,
+    bool blocked) {
+  convContrastNormCrossMap(
+      images,
+      images,
+      target,
+      numFilters,
+      sizeF,
+      addScale,
+      powScale,
+      minDiv,
+      blocked);
+}
+
+/*
+ * images:     (numFilters, imgPixels, numImages)
+ * denoms:     (numFilters, imgPixels, numImages) (out)
+ * target:     (numFilters, imgPixels, numImages) (out)
+ */
+void convResponseNormCrossMap(
+    NVMatrix& images,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    bool blocked) {
+  convContrastNormCrossMap(
+      images,
+      images,
+      target,
+      numFilters,
+      sizeF,
+      addScale,
+      powScale,
+      1,
+      blocked);
+}
+
+cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor) {
+  cudaTextureObject_t tex_obj;
+  cudaResourceDesc res_desc;
+  std::memset(&res_desc, 0, sizeof(res_desc));
+  res_desc.resType = cudaResourceTypeLinear;
+  res_desc.res.linear.devPtr = tensor->mutable_data<float>();
+  res_desc.res.linear.sizeInBytes = tensor->nbytes();
+  res_desc.res.linear.desc =
+      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+  cudaTextureDesc tex_desc;
+  std::memset(&tex_desc, 0, sizeof(tex_desc));
+  CUDA_ENFORCE(
+      cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
+  return tex_obj;
+}
diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
new file mode 100644
index 0000000..3fb31c5
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
@@ -0,0 +1,6081 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../include/cudaconv2.cuh" + +__device__ __forceinline__ void +filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + int fPidx, + int imgLoadModPosY, + int imgLoadModPosX, + int imgSizeX, + int filterSize, + int& iPidx) { + int x = imgLoadModPosX + (fPidx) % filterSize; + int y = imgLoadModPosY + (fPidx) / filterSize; + iPidx = + y >= 0 && y < imgSizeX && x >= 0 && x < imgSizeX ? y * imgSizeX + x : -1; +} + +#define FA_COLOR3_IMPRELOAD(c, i) \ + imPreload[c][i] = \ + iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \ + ? 0 \ + : mm[c * imgPixels * imgStride + i * B_X]; +#define FA_COLOR3_IMPRELOAD_TX(c, i) \ + imPreload[c][i] = \ + iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \ + ? 0 \ + : tex1Dfetch( \ + images, imagesOffset2 + c * imgPixels * imgStride + i * B_X); + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +//__launch_bounds__(128,3) +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[numColors][pixelCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + // images += myImgIdx; + // filters += blockFilterIdx + // + shFilterLoadY * numFilters + shFilterLoadX; + // if (!conv) { // NOTE: UNTESTED! + // filters += moduleIdx * numColors * filterPixels * numFilters; + // } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + + shFilterLoadX + + (conv ? 
0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][pixelCache * filtersPerThread / B_X]; + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int p = 0; p < pixelCache; p += B_X / filtersPerThread) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p * filtersPerThread / B_X] = tex1Dfetch( + filters, + filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else { + fPreload[c][p * filtersPerThread / B_X] = 0; + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && + (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch( + images, + imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { + // NOTE: bank conflicts here! + shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + fPidxNext + ty, + imgLoadModPosY, + imgLoadModPosX, + imgSizeX, + filterSize, + iPidxNext); + + // const float* ff = &filters[numFilters * fPidxNext]; + // const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + + FA_COLOR3_IMPRELOAD_TX(0, 0); + FA_COLOR3_IMPRELOAD_TX(0, 1); + FA_COLOR3_IMPRELOAD_TX(0, 2); + FA_COLOR3_IMPRELOAD_TX(0, 3); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int pp = 0; pp < pixelCache; pp += B_X / filtersPerThread) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = + fPreload[c][pp * filtersPerThread / B_X]; + } + } + + __syncthreads(); + FA_COLOR3_IMPRELOAD_TX(1, 0); + FA_COLOR3_IMPRELOAD_TX(1, 1); + FA_COLOR3_IMPRELOAD_TX(1, 2); + FA_COLOR3_IMPRELOAD_TX(1, 3); + FA_COLOR3_IMPRELOAD_TX(2, 0); + FA_COLOR3_IMPRELOAD_TX(2, 1); + FA_COLOR3_IMPRELOAD_TX(2, 2); + FA_COLOR3_IMPRELOAD_TX(2, 3); +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int pp = 0; pp < pixelCache * filtersPerThread / B_X; pp++) { + fPreload[c][pp] = + fPidxNext + pp * (B_X / filtersPerThread) + shFilterLoadY >= + filterPixels + ? 
0 + : tex1Dfetch( + filters, + filtersOffset2 + c * numFilters * filterPixels + + pp * (B_X / filtersPerThread) * numFilters); + } + } +#pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * + shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + } + + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * This won't be pretty. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[numColors][pixelCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even though + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? 
+ const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + const int warp = tidx / 32; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + // images += myImgIdx; + // filters += blockFilterIdx + // + shFilterLoadY * numFilters + shFilterLoadX; + // if (!conv) { // NOTE: UNTESTED! + // filters += moduleIdx * numColors * filterPixels * numFilters; + // } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + + shFilterLoadX + + (conv ? 0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][DIVUP(pixelCache * filtersPerThread, B_X)]; + + if (warp < 3) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int p = 0; p < pixelCache; p += 2) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p / 2] = tex1Dfetch( + filters, + filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else { + fPreload[c][p / 2] = 0; + } + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && + (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch( + images, + imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + fPidxNext + ty, + imgLoadModPosY, + imgLoadModPosX, + imgSizeX, + filterSize, + iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! 
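+        // The conflict is presumably because consecutive threads (tx) write
+        // words imgsPerThread apart, so with imgsPerThread == 4 lanes that are
+        // 8 apart land on the same shared-memory bank (roughly a 4-way
+        // conflict on this store).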
+ shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + if (warp < 3) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int pp = 0; pp < pixelCache; pp += 2) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp / 2]; + } + } + } + + __syncthreads(); + // const float* ff = &filters[numFilters * fPidxNext]; + // const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + FA_COLOR3_IMPRELOAD_TX(c, i); + } + } + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int pp = 0; pp < 2; pp++) { + fPreload[c][pp] = + warp >= 3 || fPidxNext + pp * 2 + shFilterLoadY >= filterPixels + ? 0 + : tex1Dfetch( + filters, + filtersOffset2 + c * numFilters * filterPixels + + pp * 2 * numFilters); + } +#pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * + shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + } + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops costs 2 registers, but saves time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +__device__ inline void +filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + int filterSize, + int imgSizeX, + int imgLoadModPosY, + int imgLoadModPosX, + int imgY, + int imgX, + int& fPidx, + int& iPidx) { + int filterPxY = imgY - imgLoadModPosY; + int filterPxX = imgX - imgLoadModPosX; + fPidx = filterPxY * filterSize + filterPxX; + iPidx = imgY * imgSizeX + imgX; // Pixel index in img +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * Note: in git there's a 1.5% faster version of this which sues 167 registers + * instead of 154... it's basically the same thing, but it doesn't do the + * next-pixel computation. It just avoids pre-loading when it rolls over to the + * next pixel. 
+ */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[colorCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; + // float fCache[filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my + // macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; + float fPreload[colorCache * filtersPerThread / B_X]; + // float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgStartY, + imgStartX, + fPidx, + iPidx); + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || 
myImgIdx + i * B_X < numImages) { + imPreload[i] = images[imgStride * iPidx + i * B_X]; + } else { + imPreload[i] = 0; + } + } + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < + B_X / filtersPerThread) { // This if statement reduces reg usage.. +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = + filters[(c * filterPixels + fPidx) * numFilters]; + } + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + // const int filterPxX = imgX - imgLoadModPosX; + // const int p = filterPxY * filterSize + filterPxX; + // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in + // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, + // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m = + // &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + (imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgYNext, + imgXNext, + fPidxNext, + iPidxNext); + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + const float* ff = + &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; + const float* mm = + &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + if (oc == numFilterColors - colorCache) { + ff = &filters[fPidxNext * numFilters]; + mm = &images[iPidxNext * imgStride]; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX] = + fPreload[c * filtersPerThread / B_X]; + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + shImages[ty][tx * imgsPerThread + i] = imPreload[i]; + } + imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) + ? 0 + : mm[0 * B_X]; + imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) + ? 0 + : mm[1 * B_X]; + imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) + ? 0 + : mm[2 * B_X]; + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * + shFilters[0][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[0] = ff[0]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * + shFilters[1][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[1] = ff[(B_X / filtersPerThread * filterPixels) * numFilters]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * + shFilters[2][threadIdx.y * filtersPerThread + f]; + } + } + + imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) + ? 
0 + : mm[3 * B_X]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * + shFilters[3][threadIdx.y * filtersPerThread + f]; + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void __launch_bounds__(128, 4) + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + // avoid bank conflict by reorganizing the data structure and improve the band + // width by using 'float2' instead of 'float' + __shared__ float2 + shFilters[colorCache / 2] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float2 + shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int 
blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + // const int tidx = ty * B_X + threadIdx.x; // reduce one register + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // reduce two registers + // const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + // const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx; + const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx; + + // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + + // myImgIdx; + const int filterOffset = blockFilterIdx + + ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels + + ((ty * B_X + tx) % (B_Y * filtersPerThread)) + + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); + // filters +=blockFilterIdx + // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; + // if (!conv) { + // filters += moduleIdx * numFilterColors * filterPixels * numFilters; + // } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + // combine two registers into one + const int numModImages = numModules * numImages; + float prod[imgsPerThread][filtersPerThread]; + // float fCache[filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my + // macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; // [4] + float fPreload[colorCache * filtersPerThread / B_X]; // [2] + // float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgStartY, + imgStartX, + fPidx, + iPidx); + +// remove redundant conditions +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + imPreload[i] = + tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); + } + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = tex1Dfetch( + filters, filterOffset + (c * filterPixels + fPidx) * numFilters); + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + // const int filterPxX = imgX - imgLoadModPosX; + // const int p = filterPxY * filterSize + filterPxX; + // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in + // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, + // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m = + // &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + 
(imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgYNext, + imgXNext, + fPidxNext, + iPidxNext); + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + // store the preloaded pixel of filter and image into shared memory + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)] + [(ty * B_X + tx) % (B_Y * filtersPerThread)] + .x = fPreload[0]; + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)] + [(ty * B_X + tx) % (B_Y * filtersPerThread)] + .y = fPreload[1]; + shImages[ty][tx].x = imPreload[0]; + shImages[ty][tx].y = imPreload[1]; + shImages[ty][tx + B_X].x = imPreload[2]; + shImages[ty][tx + B_X].y = imPreload[3]; + + int imgOffset2 = + imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); + int filterOffset2 = filterOffset + + numFilters * ((oc + colorCache) * filterPixels + fPidx); + if (oc == numFilterColors - colorCache) { + filterOffset2 = filterOffset + fPidxNext * numFilters; + imgOffset2 = imgOffset + iPidxNext * imgStride; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + + // preload one pixel of filter and image from texture, and no need to + // check 'checkImgBounds' with all callers setting it as false + imPreload[0] = tex1Dfetch(images, imgOffset2); + imPreload[1] = tex1Dfetch(images, imgOffset2 + B_X); + imPreload[2] = tex1Dfetch(images, imgOffset2 + 2 * B_X); + imPreload[3] = tex1Dfetch(images, imgOffset2 + 3 * B_X); + fPreload[0] = tex1Dfetch(filters, filterOffset2); + fPreload[1] = tex1Dfetch( + filters, filterOffset2 + 2 * filterPixels * numFilters); + + __syncthreads(); + +// put together the instructions with same type to improve instruction-level +// parallelism calculate the convolution between images and filters +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int r = 0; r < colorCache / 2; r++) { + prod[0][f] += + shImages[r][tx].x * shFilters[r][ty * filtersPerThread + f].x; + prod[1][f] += + shImages[r][tx].y * shFilters[r][ty * filtersPerThread + f].x; + prod[2][f] += shImages[r][tx + B_X].x * + shFilters[r][ty * filtersPerThread + f].x; + prod[3][f] += shImages[r][tx + B_X].y * + shFilters[r][ty * filtersPerThread + f].x; + prod[0][f] += shImages[r + 2][tx].x * + shFilters[r][ty * filtersPerThread + f].y; + prod[1][f] += shImages[r + 2][tx].y * + shFilters[r][ty * filtersPerThread + f].y; + prod[2][f] += shImages[r + 2][tx + B_X].x * + shFilters[r][ty * filtersPerThread + f].y; + prod[3][f] += shImages[r + 2][tx + B_X].y * + shFilters[r][ty * filtersPerThread + f].y; + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = + scaleTargets * targets[i * B_X + f * numModImages] + + scaleOutputs * prod[i][f]; + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f]; + } + } + } +} + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X + * * imgsPerThread images. 
threadIdx.x determines image threadIdx.y determines + * filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of module and B_Y * filtersPerThread + * + * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numColors, filterPixels, numFilters) if conv + * (numModules, numColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * + * Number of filters per module should be divisible by B_Y * filtersPerThread + * checkImgBounds indicates whether number of images is divisible by B_X * + * imgsPerThread + * + * The imgSize here is the size of the actual image without the padding. + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_color( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + __shared__ float + shFilters[pixelCache * numColors] + [B_Y * filtersPerThread]; // pre-load pixelCache pixels from + // B_Y*filtersPerThread filters + __shared__ float + shImages[pixelCache * numColors] + [B_X * imgsPerThread]; // pre-load pixelCache pixels from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = blockIdx.y % blocksPerModule; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + const int numModules = numModulesY * numModulesX; + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + images += myImgIdx; + filters += filtersPerThread * B_Y * blockFilterIdx + + shFilterLoadY * numFilters + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx * B_Y * filtersPerThread + + threadIdx.y * filtersPerThread) * + numImages * numModulesY * numModulesX + + myImgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + // float* shImgLoad = &shImages[0][threadIdx.x]; + for (int p = 0; p < filterPixels; p += pixelCache) { + /* + * Load pixelCache pixels from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + * filtersPerThread. In this case, not all of the threads will participate + * in the loading operation. This ensures that in each loop iteration, an + * integer number of rows of shFilters are filled, which makes indexing + * simple. 
+ */ + if (B_X % filtersPerThread == 0 || shFilterLoadY < B_X / filtersPerThread) { +#pragma unroll + for (int p2 = 0; p2 < pixelCache; p2 += B_X / filtersPerThread) { + const bool omit = pixelCache % (B_X / filtersPerThread) == 0; + const int preloadPx = shFilterLoadY + p2; + if (omit || preloadPx < pixelCache) { + if (p + preloadPx < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = + filters[(c * filterPixels + p + p2) * numFilters]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = 0; + } + } + } + } + } + +/* + * Load pixelCache pixels from B_X*imgsPerThread images. + */ +#pragma unroll + for (int ly = 0; ly < pixelCache; ly += B_Y) { + const int preloadPx = ly + threadIdx.y; + const int pixIdx = p + preloadPx; + const bool omit = pixelCache % B_Y == 0; // Compile-time condition + /* + * Don't load any image pixels corresponding to filter pixels that don't + * exist. + */ + if (pixIdx < filterPixels && (omit || preloadPx < pixelCache)) { + const int x = imgLoadModPosX + pixIdx % filterSize; + const int y = imgLoadModPosY + pixIdx / filterSize; + + if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) { + float* m = &images[imgStride * (y * imgSizeX + x)]; + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = + m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } else { // Padding +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } + } + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < pixelCache * numColors; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] += shImages[i][g + threadIdx.x * imgsPerThread] * + shFilters[i][threadIdx.y * filtersPerThread + f]; + } + } + } + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * numImages * numModules] = + scaleTargets * targets[g * B_X + f * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X + * * imgsPerThread images. 
threadIdx.x determines image threadIdx.y determines + * filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of B_Y * filtersPerThread + * + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * B_Y one of 4, 8, 16 + * B_X one of 16, 32 + * imgsPerThread one of 1, 2, 4 + * filtersPerThread one of 1, 2, 4, 8 + * colorCache: how many colors to put into shmem + * + * numFilters should be divisible by B_Y * filtersPerThread + * numImages be divisible by B_X * imgsPerThread + * numFilterColors should be divisible by colorCache. + * numImgColors must be even. + * numFilters must be divisible by numGroups. + * no restrictions on pixelCache + * The imgSize here is the size of the actual image without the padding. + * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for + * maximum efficiency. + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + __shared__ float + shFilters[colorCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y) * numImages * numModules + myImgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + 
} + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + const int filterPxX = imgX - imgLoadModPosX; + const int p = filterPxY * filterSize + filterPxX; + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + filtersPerThread. + * In this case, not all of the threads will participate in the loading + operation. + * This ensures that in each loop iteration, an integer number of rows + of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition + under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < + B_X / filtersPerThread) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + if (colorCache % (B_X / filtersPerThread) == 0 || + c + shFilterLoadY < colorCache) { + shFilters[c + shFilterLoadY][shFilterLoadX] = + filters[((oc + c) * filterPixels + p) * numFilters]; + } + } + } + + /* + * Load a pixel from B_X*imgsPerThread images. + */ + const int pixIdx = imgY * imgSizeX + imgX; // Pixel index in img + + float* m = &images[imgStride * (oc * imgPixels + pixIdx)]; +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y) { + if (colorCache % B_Y == 0 || threadIdx.y + c < colorCache) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = + m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = 0; + } + } + } + } + + __syncthreads(); + + for (int c = 0; c < colorCache; c++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][g] += shImages[c][g * B_X + threadIdx.x] * + shFilters[c][threadIdx.y + f * B_Y]; + } + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * + targets[g * B_X + f * B_Y * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. 
using vectorized data type; * Note: This function can be used + *when each thread loads even number of filter * pixels(filtersPerThread * + *colorCache / B_X is even), and this can be * optimized more when the number + *of loaded image's pixel is even. * + *********************************************************************************/ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2_f_vec( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + // improve shared memory's band width by using 'float2' instead of 'float' + __shared__ float2 + shFilters[colorCache / 2] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = ty * B_X + tx; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // load position of filters' pixels for current thread + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + // load position of images' pixels for current thread + const int shImgLoadY = tidx / (B_X * imgsPerThread); + const int shImgLoadX = tidx % (B_X * imgsPerThread); + + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX; + images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx; + + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + ty) * numImages * numModules + + blockIdx.x * B_X * imgsPerThread + tx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + + // temporary buffer to store the filter's loaded pixels during each loop + float fPreload[colorCache * filtersPerThread / B_X]; + // temporary buffer to store the 
image's loaded pixels during each loop + float iPreload[colorCache * imgsPerThread / B_Y]; + +// preload filter's pixels +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize + + (imgStartX - imgLoadModPosX)) * + numFilters]; + } + + // preload image's pixels + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [(c * imgPixels + imgStartY * imgSizeX + imgStartX) * imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) +// store the preloaded filter's pixels into shared memory +#pragma unroll + for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX].x = + fPreload[c * filtersPerThread / B_X]; + shFilters[c + shFilterLoadY][shFilterLoadX].y = + fPreload[(c + colorCache / 2) * filtersPerThread / B_X]; + } + +// store the preloaded image's pixels into shared memory +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + shImages[c + shImgLoadY][shImgLoadX] = + iPreload[c * imgsPerThread / B_Y]; + } + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + filtersPerThread. + * In this case, not all of the threads will participate in the loading + operation. + * This ensures that in each loop iteration, an integer number of rows + of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition + under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + + /* preload image and filter pixels' data */ + if ((oc + colorCache) == + numFilterColors) { // move to next pixel when all colors of current + // pixel have been finished + int imgXn = (imgX < (imgEndX - 1)) ? 
(imgX + 1) : imgStartX; + int imgYn = imgY + (imgXn != (imgX + 1)); + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize + + (imgXn - imgLoadModPosX)) * + numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } else { // move next colorCache +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [((c + oc + colorCache) * filterPixels + + (imgY - imgLoadModPosY) * filterSize + + (imgX - imgLoadModPosX)) * + numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [((c + oc + colorCache) * imgPixels + imgY * imgSizeX + + imgX) * + imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } + + __syncthreads(); + + // convolution + for (int c = 0; c < colorCache / 2; c++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][g] += + shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x; + prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] * + shFilters[c][ty + f * B_Y].y; + } + } + } + __syncthreads(); + } + } + } + + // write convolution result into global memory + if (scale) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * + targets[g * B_X + f * B_Y * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModules, numImages) + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but but I made no attempt whatsoever + * to make them work fast. 
+ */ +void _filterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput, + bool conv) { + CAFFE_ENFORCE(images->ndim() == 2); + CAFFE_ENFORCE(filters->ndim() == 2); + CAFFE_ENFORCE(targets->ndim() == 2); + + int numFilterColors = numImgColors / numGroups; + int numFilters = filters->dim32(1); + int numModules = numModulesY * numModulesX; + int numImages = images->dim32(1); + int imgPixels = images->dim32(0) / numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int filterModuleMult = conv ? 1 : numModules; + + CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0); + CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0); + CAFFE_ENFORCE(numImgColors % numGroups == 0); + CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors); + CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels); + int numFiltersPerGroup = numFilters / numGroups; + + int imgStride = images->dim32(1); + + int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors); + int filterSize = int(sqrt(filterPixels)); + CAFFE_ENFORCE(filterSize * filterSize == filterPixels); + CAFFE_ENFORCE( + filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + int filtersPerThread, threadsY = 4; + if (numImgColors <= 3) { + // Special kernels written for colors = 3, filters = 64 and colors = 3, + // filters = 48 cases. The remaining cases use the old routines. + // TODO: Modernize the remaining cases if you care about them. + filtersPerThread = numFiltersPerGroup % 64 == 0 + ? 16 + : numFiltersPerGroup % 48 == 0 ? 12 + : numFiltersPerGroup % 32 == 0 ? 8 : 4; + } else { + filtersPerThread = numFiltersPerGroup % 64 == 0 + ? 16 + : numFiltersPerGroup % 32 == 0 ? 8 : 4; + threadsY = numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 && + imgsPerThread != 4 + ? 8 + : 4; + } + int threadsX = 32; + dim3 threads(threadsX, threadsY); + dim3 blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread), + (numModules * numFilters) / (threads.y * filtersPerThread)); + + bool checkImgBounds = numImages % (threads.x * imgsPerThread) != 0; + bool scale = scaleTargets != 0; + if (scaleTargets == 0) { + targets->Resize(std::vector<int>{numFilters * numModules, numImages}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == numFilters * numModules); + CAFFE_ENFORCE(targets->dim32(1) == numImages); + } + + float* images_data = images->mutable_data<float>(); + float* filters_data = filters->mutable_data<float>(); + float* targets_data = targets->mutable_data<float>(); + const std::size_t images_bytes = images->nbytes(); + + cudaStream_t stream = context->cuda_stream(); + + checkCudaErrors(cudaDeviceSetSharedMemConfig( + cudaSharedMemBankSizeEightByte)); // use the wider bandwidth + + // Auto-generated calling code...
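The dispatch ladder that follows is easier to read once the launch geometry above is made concrete. The sketch below is a minimal host-only illustration, not a definitive part of this kernel file: it assumes DIVUP is the usual ceiling-division helper, picks arbitrary example sizes, and simplifies the filtersPerThread heuristic to the many-color path, then reproduces the grid/block arithmetic used when launching the kernels on the Caffe2 context's stream.

#include <cstdio>

// Assumed helper: ceiling division, as used by the dispatcher.
#define DIVUP(a, b) (((a) + (b) - 1) / (b))

int main() {
  // Illustrative sizes: a 256-image minibatch, 64 filters in one group,
  // and a 24x24 grid of output modules.
  const int numImages = 256, numFilters = 64, numGroups = 1;
  const int numModulesY = 24, numModulesX = 24;
  const int numModules = numModulesY * numModulesX;
  const int numFiltersPerGroup = numFilters / numGroups;

  // Same packing heuristics as the host code above (many-color path):
  // more images per thread for batches divisible by 128/64, more filters
  // per thread when the per-group filter count allows it.
  const int imgsPerThread =
      numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
  const int filtersPerThread =
      numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 32 == 0 ? 8 : 4;
  const int threadsX = 32;
  const int threadsY = 4; // the dispatcher bumps this to 8 for some configs

  // grid.x tiles the minibatch; grid.y tiles (modules x filters).
  const int gridX = DIVUP(numImages, threadsX * imgsPerThread);
  const int gridY = (numModules * numFilters) / (threadsY * filtersPerThread);

  std::printf("block = (%d, %d), grid = (%d, %d)\n",
              threadsX, threadsY, gridX, gridY);

  // Each kernel writes output element (f, m, i) to
  // targets[(f * numModules + m) * numImages + i], matching the
  // (numFilters, numModules, numImages) layout documented above.
  return 0;
}

With these example sizes the sketch prints block = (32, 4), grid = (2, 576): two blocks cover the 256 images at 32 * 4 images per block, and 576 blocks cover the 576 modules times 64 filters at 4 * 16 filters per block, which is exactly the shape the generated launches below expect.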
+ // NOTE: The calling code is set up such that if checkImgBounds is true, then + // imgsPerThread = 1. In principle it doesn't have to be this way, and you may + // want to optimize for that case. + + if (scale == false) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 64 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + 
numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>, + 
cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + 
filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + false, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + false, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + 
imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false>, + 
cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if 
(numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 
1, 16, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 
4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } + } else if (scale == true) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 64 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + 
filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + 
numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, 
true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + true, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + true, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 
4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 
32, 1, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + 
scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, 
+ numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 
1, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + 
filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } + } + + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + getLastCudaError("filterActs: kernel execution failed"); +} + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) 
{ + convFilterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1); +} + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _filterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + true); +} + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + localFilterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1); +} + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _filterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + false); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu new file mode 100644 index 0000000..e8dd351 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu @@ -0,0 +1,9796 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../include/cudaconv2.cuh" + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread. + * blockIdx.y determines 4x4 image region in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numColors, filterPixels, numFilters) if conv (numModulesY, + * numModulesX, numColors, filterPixels, numFilters) otherwise targets: + * (numColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * Number of filters must be divisible by 16. + * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is + * false. 16 * imgsPerThread must be divisible by 32. + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads 16 weights at a time, so those aren't fully coalesced. 
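+ * (Worked example of the mapping above, derived from the index math in the
+ * kernel body: blockIdx.y enumerates the 4x4 pixel regions, presumably
+ * DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4) of them, threadIdx.y picks one of
+ * the 16 pixels inside a region, and threadIdx.x picks a case, so with
+ * imgsPerThread = 2 one block reconstructs a 4x4 region for 32 cases.)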
+ * This version conserves shared memory by loading 16 filters at a time rather + * than 32. + */ +template < + int imgsPerThread, + int numColors, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void img_acts_color( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[numColors * 16][16 + 1]; + __shared__ float shHidActs[16][16 * imgsPerThread]; + + const int blockCaseIdx = blockIdx.x * 16 * imgsPerThread; + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const int numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeX * imgSizeY; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += blockCaseIdx + loadY * numImages * numModules + loadX; + filters += threadIdx.x; + targets += pxIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[numColors][imgsPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize + ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize + ? 0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && + pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFilters; + f += 16) { // multiply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides + // if it's interested. 
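+ // Each iteration of this loop stages 16 rows of hidActs for the current
+ // module in shHidActs; threads whose pixel lies inside the module also stage
+ // the 16 matching weights per color in shFilters and then accumulate the
+ // products into prod[][].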
+ const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights + // from this module to its pixel. Not fully coalesced read :( But + // taking out this read entirely only reduces the runtime by ~2.8%, so + // it isn't costing me much. + const float* fLoad = conv + ? &filters[pxIdxInModule * numFilters + f] + : &filters + [(moduleIdx * numColors * filterPixels + pxIdxInModule) * + numFilters + + f]; +#pragma unroll + for (int c = 0; c < numColors; c++) { + shilterLoad[c * 16 * (16 + 1)] = + fLoad[c * filterPixels * numFilters]; + } + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int w = 0; w < 16; w++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * + shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... shmem (and fully coalesced) version is + // actually slightly slower, though + if (isPxInImg) { + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleTargets * targets[c * imgPixels * numImages + i * 16] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread, also color in + * batches of colorsPerThread. In essence, blockIdx.x.x + * = 1..numImages/(16*imgsPerThread) blockIdx.x.y + * = 1..numImgColors/colorsPerThread blockIdx.y determines 4x4 image region in + * target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX, + * numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * numImageColors/numGroups must be divisible by colorsPerThread. + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads 16 weights at a time, so those aren't fully coalesced. 
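+ * (Here blockIdx.x packs two indices: blockIdx.x % numImgBlocks selects the
+ * block of 16*imgsPerThread cases and blockIdx.x / numImgBlocks selects the
+ * group of colorsPerThread colors, so the host presumably launches
+ * DIVUP(numImages, 16*imgsPerThread) * numImgColors/colorsPerThread blocks
+ * along x.)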
+ * This version conserves shared memory by loading 16 filters at a time rather + * than 32. + * + * To be used when there are 4-16 color channels. + */ +template < + int imgsPerThread, + int colorsPerThread, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void img_acts_mediumcolor( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * 16][16 + 1]; + __shared__ float shHidActs[16][16 * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, 16 * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16 * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const uint numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += + blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX; + filters += + blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x; + targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize + ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize + ? 
0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && + pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFiltersPerGroup; + f += 16) { // multipply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides + // if it's interested. + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights + // from this module to its pixel. + + // Not fully coalesced read :( + // But taking out this read entirely only reduces the runtime by + // ~2.8%, so it isn't costing me much. + const float* fLoad = conv + ? &filters[pxIdxInModule * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInModule * numFilters + f]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + shFilterLoad[c * 16 * (16 + 1)] = + fLoad[c * filterPixels * numFilters]; + } + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int w = 0; w < 16; w++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * + shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... shmem (and fully coalesced) version is + // actually slightly slower, though + if (isPxInImg) { + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleTargets * targets[c * imgPixels * numImages + i * 16] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +/* + * Block size: B_YxB_X. 
+ * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in + batches of B_Y*colorsPerThread. + * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) + * blockIdx.y determines image pixel in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from + B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCache. + * + * B_X * imgsPerThread must be divisible by 32. + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by 32. + * filterCache must be divisible by B_X*B_Y/32 + * B_X*B_Y must be divisible by filterCache + + * This version loads 32 cases at a time, so it gets full coalescing on that + load. + * It only loads filterCache weights at a time, so those aren't fully coalesced + (depending on size of filterCache). + * + * To be used when there are >= 16 color channels. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCache, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void conv_img_acts_manycolor( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCache + 1]; + __shared__ float shHidActs[filterCache][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32; + const int filtersLoadY = tidx / filterCache, + filtersLoadX = tidx % filterCache; + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + + float 
prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; + f += filterCache) { // multiply with filterCache filters at a time + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * B_X; i += 32) { + if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) { +#pragma unroll + for (int j = 0; j < filterCache; j += + B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X + // cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < filterCache; j += + B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X + // cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + const float* fLoad = conv + ? 
&filters[pxIdxInFilter * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f]; +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCache) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCache) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * (filterCache + 1)] = + fLoad[i * filterPixels * numFilters]; + } + } + + __syncthreads(); +// Do some actual computation +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int w = 0; w < filterCache; w++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * + shHidActs[w][threadIdx.x + i * B_X]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +/* + * Block size: B_YxB_X. + * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in + * batches of B_Y*colorsPerThread. In essence, blockIdx.x.x + * = 1..numImages/(B_X*imgsPerThread) blockIdx.x.y + * = 1..numImgColors/(B_Y*colorsPerThread) blockIdx.y determines image pixel in + * target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX, + * numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from + * B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCacheF. + * + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by filterCacheF + * filterCacheF must be divisible by filterCacheH + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads filterCacheF weights at a time, so those aren't fully + * coalesced (depending on size of filterCacheF). + * + * To be used when there are >= 16 color channels. 
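+ * (In the kernel body the outer loop stages filterCacheF filter columns of
+ * weights at a time, while the inner loop stages hidActs in chunks of
+ * filterCacheH rows, which is why filterCacheF must be divisible by
+ * filterCacheH.)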
+ */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCacheF, + int filterCacheH, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void conv_img_acts_manycolor_kepler( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = threadIdx.y, hidActLoadX = threadIdx.x; + // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % + // (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, + filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save + // registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 
0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + // const bool noFLoop = filterCacheF == filterCacheH; + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time + const float* fLoad = conv + ? &filters[pxIdxInFilter * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f]; +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + fLoad[i * filterPixels * numFilters]; + } + } + //#pragma unroll + + for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { + // conv_img_acts_manycolor_dummy_fhLoop(hidActs, shHidActLoad, shHidActs, shFilters, + // moduleIdx, numImages, hidActLoadY, hidActLoadX, blockCaseIdx, + // numModules, f, fh, prod); + + const float* hLoad = + &hidActs[(moduleIdx + fh * numModules) * numImages]; + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || hidActLoadY + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread * B_X; i += B_X) { + if (!checkCaseBounds || + blockCaseIdx + hidActLoadX + i < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } else { + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + } + + __syncthreads(); + +// Do some actual computation +// Using these variables causes register usage to go from 161 --> 123. +// But nonetheless, the high-register version is faster. 
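+// The unrolled loops below form the inner product: each thread multiplies the
+// filterCacheH cached hidAct rows by its colorsPerThread filter rows and
+// accumulates imgsPerThread partial sums per color in registers (prod[][]).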
+// const float* shF = &shFilters[threadIdx.y][fh-f]; +// const float* const shF2 = &shFilters[threadIdx.y][fh]; +// const float* shH = &shHidActs[0][threadIdx.x]; +#pragma unroll + for (int w = 0; w < filterCacheH; w++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[c * B_Y + threadIdx.y][fh - f + w] * + shHidActs[w][threadIdx.x + i * B_X]; + } + } + } + __syncthreads(); + } + } + } + } + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +/* + * New Titan-optimized stuff. + */ + +__device__ __forceinline__ void +conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + const int my, + const int mx, + const int numModulesX, + const int paddingStart, + const int moduleStride, + const int blockPixelIdxY, + const int blockPixelIdxX, + const int filterSize, + int& moduleIdx, + int& pxIdxInFilter) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + moduleIdx = my * numModulesX + mx; // out + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; // out +} + +#define IA_PRELOAD_LOOP(w, offset) \ + _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) { \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] * \ + shHidActs[w][threadIdx.x * imgsPerThread + i]; \ + } \ + } + +/* + * Same loop as above but inverted. 
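+ * IA_PRELOAD_LOOP keeps images in the outer loop and colors inside;
+ * IA_PRELOAD_LOOP2 swaps that nesting (colors outer, images inner). The
+ * values accumulated into prod[c][i] are identical; only the instruction
+ * ordering differs.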
+ */
+#define IA_PRELOAD_LOOP2(w, offset)                                     \
+  _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) {         \
+    _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) {         \
+      prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] *  \
+          shHidActs[w][threadIdx.x * imgsPerThread + i];                \
+    }                                                                   \
+  }
+
+#define IA_PRELOAD_LOOP3(i, offset)                                     \
+  _Pragma("unroll") for (int w = 0; w < filterCacheH; w++) {            \
+    _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) {       \
+      prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] *  \
+          shHidActs[w][threadIdx.x * imgsPerThread + i];                \
+    }                                                                   \
+  }
+
+#define IA_PRELOAD_W(z) \
+  wPreload[z] = fLoad[(z)*B_X * B_Y / filterCacheF * filterPixels * numFilters];
+#define IA_PRELOAD_W_TX(z)                                               \
+  wPreload[z] = tex1Dfetch<float>(                                       \
+      filters,                                                           \
+      filtersLoadOffset +                                                \
+          (z)*B_X * B_Y / filterCacheF * filterPixels * numFilters);
+#define IA_PRELOAD_H(y, x)                                               \
+  if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) {             \
+    hPreload[y][x] = hLoad[(y)*B_Y * numModules * numImages + (x)*B_X];  \
+  }
+#define IA_PRELOAD_H_TX(y, x)                                            \
+  if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) {             \
+    hPreload[y][x] = tex1Dfetch<float>(                                  \
+        hidActs,                                                         \
+        hidActsLoadOffset + (y)*B_Y * numModules * numImages + (x)*B_X); \
+  }
+
+template <
+    int B_Y,
+    int B_X,
+    int imgsPerThread,
+    int colorsPerThread,
+    int filterCacheF,
+    int filterCacheH,
+    bool scale,
+    bool checkCaseBounds,
+    bool conv>
+__global__ void __launch_bounds__(
+    256,
+    2) // 256 threads per block, 2 blocks per multiprocessor
+       // These launch bounds ensure 25% occupancy (128 registers used)
+       // as opposed to 13% (130 registers) achieved by the defaults.
+    conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(
+        cudaTextureObject_t hidActs,
+        cudaTextureObject_t filters,
+        float* targets,
+        const int numModulesY,
+        const int numModulesX,
+        const int numImages,
+        const int numFilters,
+        const int filterSize,
+        const int imgSizeY,
+        const int imgSizeX,
+        const int paddingStart,
+        const int moduleStride,
+        const int numImgColors,
+        const int numGroups,
+        const float scaleTargets,
+        const float scaleOutputs) {
+  __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF];
+  __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread];
+
+  const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
+  const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
+  const int myCaseIdx = blockCaseIdx + threadIdx.x;
+
+  const int imgColorIdx =
+      (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
+  const int numFilterColors = numImgColors / numGroups;
+  const int blockGroupIdx = imgColorIdx / numFilterColors;
+  const int filterColorIdx =
+      imgColorIdx % numFilterColors; // color idx within group
+  const int numFiltersPerGroup = numFilters / numGroups;
+  const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
+
+  const int blockPixelIdx = blockIdx.y;
+  const int blockPixelIdxX = blockPixelIdx % imgSizeX;
+  const int blockPixelIdxY = blockPixelIdx / imgSizeX;
+
+  const int filterPixels = filterSize * filterSize;
+  const int imgPixels = imgSizeY * imgSizeX;
+  const int tidx = threadIdx.y * B_X + threadIdx.x;
+  // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x %
+  // B_X;
+  // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx %
+  // (B_X*imgsPerThread);
+  const int filtersLoadY = tidx / filterCacheF,
+            filtersLoadX = tidx % filterCacheF;
+  // nvcc is behaving idiotically again, these useless declarations save
+  //
registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + const int hidActsOffset = + (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + + // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * + // filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH / B_Y][imgsPerThread]; // [2][4] + float wPreload[filterCacheF * colorsPerThread / B_X]; // [8] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + startY, + startX, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdx, + pxIdxInFilter); + // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] + // : &filters[moduleIdx * numFilterColors * + // filterPixels * numFilters + pxIdxInFilter * + // numFilters + 0]; + int filtersLoadOffset = filtersOffset + + (conv ? 
pxIdxInFilter * numFilters + 0 + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters); +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch( + filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + + // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages]; + int hidActsLoadOffset = + hidActsOffset + (moduleIdx + 0 * numModules) * numImages; +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j / B_Y][i] = tex1Dfetch( + hidActs, + hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + myNext, + mxNext, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdxNext, + pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + wPreload[i * filterCacheF / (B_X * B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * + numFilters + + pxIdxInFilterNext * numFilters); + } + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! 
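+              // (shHidActLoad points at shHidActs[threadIdx.y][threadIdx.x *
+              // imgsPerThread], so adjacent threads write shared-memory words
+              // imgsPerThread apart and several lanes of a warp land in the
+              // same bank.)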
+ if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheH) * numModules) * numImages; + +#pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z, 0); + IA_PRELOAD_W_TX(z); + } + +#pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z, 0); + IA_PRELOAD_H_TX((z - 4) / 4, z % 4); + } + +#pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z, 0); + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + +#pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + IA_PRELOAD_W_TX(z + 4); + } + +#pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + IA_PRELOAD_H_TX((z - 4) / 4, z % 4); + } + +#pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + } + + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCacheF, + int filterCacheH, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void +//__launch_bounds__(128, 3) // 128 threads per block, 3 blocks per +// multiprocessor +conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16( + cudaTextureObject_t hidActs, + cudaTextureObject_t filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int myCaseIdx = blockCaseIdx + threadIdx.x; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % 
numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % + // B_X; + // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % + // (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, + filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save + // registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + const int hidActsOffset = + (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + + // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + + // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * + // filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH / B_Y][imgsPerThread]; // [4][4] + float wPreload[filterCacheF * colorsPerThread / B_X]; // [6] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + startY, + startX, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdx, + pxIdxInFilter); + // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] + // : &filters[moduleIdx * numFilterColors * + // filterPixels * numFilters + pxIdxInFilter * + // numFilters + 0]; + int filtersLoadOffset = filtersOffset + + (conv ? 
pxIdxInFilter * numFilters + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters); +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch( + filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + + // const float* hLoad = &hidActs[moduleIdx * numImages]; + int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j / B_Y][i] = tex1Dfetch( + hidActs, + hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + myNext, + mxNext, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdxNext, + pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + wPreload[i * filterCacheF / (B_X * B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * + numFilters + + pxIdxInFilterNext * numFilters); + } + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + + __syncthreads(); + + // It seems that there is no point explicitly interleaving loads + // and computations because the scheduler does that anyway. 
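+      // The 16 IA_PRELOAD_LOOP2 steps below consume the filterCacheH (= 16)
+      // cached activations for this tile; the 6 IA_PRELOAD_W_TX and 16
+      // IA_PRELOAD_H_TX fetches then refill wPreload (6 floats) and hPreload
+      // (4x4 floats) for the next iteration.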
+
+        IA_PRELOAD_LOOP2(0, 0);
+        IA_PRELOAD_LOOP2(1, 0);
+        IA_PRELOAD_LOOP2(2, 0);
+        IA_PRELOAD_LOOP2(3, 0);
+        IA_PRELOAD_LOOP2(4, 0);
+        IA_PRELOAD_LOOP2(5, 0);
+        IA_PRELOAD_LOOP2(6, 0);
+        IA_PRELOAD_LOOP2(7, 0);
+        IA_PRELOAD_LOOP2(8, 0);
+        IA_PRELOAD_LOOP2(9, 0);
+        IA_PRELOAD_LOOP2(10, 0);
+        IA_PRELOAD_LOOP2(11, 0);
+        IA_PRELOAD_LOOP2(12, 0);
+        IA_PRELOAD_LOOP2(13, 0);
+        IA_PRELOAD_LOOP2(14, 0);
+        IA_PRELOAD_LOOP2(15, 0);
+
+        IA_PRELOAD_W_TX(0);
+        IA_PRELOAD_W_TX(1);
+        IA_PRELOAD_W_TX(2);
+        IA_PRELOAD_W_TX(3);
+        IA_PRELOAD_W_TX(4);
+        IA_PRELOAD_W_TX(5);
+
+        IA_PRELOAD_H_TX(0, 0);
+        IA_PRELOAD_H_TX(0, 1);
+        IA_PRELOAD_H_TX(0, 2);
+        IA_PRELOAD_H_TX(0, 3);
+        IA_PRELOAD_H_TX(1, 0);
+        IA_PRELOAD_H_TX(1, 1);
+        IA_PRELOAD_H_TX(1, 2);
+        IA_PRELOAD_H_TX(1, 3);
+        IA_PRELOAD_H_TX(2, 0);
+        IA_PRELOAD_H_TX(2, 1);
+        IA_PRELOAD_H_TX(2, 2);
+        IA_PRELOAD_H_TX(2, 3);
+        IA_PRELOAD_H_TX(3, 0);
+        IA_PRELOAD_H_TX(3, 1);
+        IA_PRELOAD_H_TX(3, 2);
+        IA_PRELOAD_H_TX(3, 3);
+
+        __syncthreads();
+      }
+    }
+  }
+  if (scale) {
+#pragma unroll
+    for (int c = 0; c < colorsPerThread; c++) {
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
+          targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
+                  targets[c * B_Y * imgPixels * numImages + i * B_X] +
+              scaleOutputs * prod[c][i];
+        }
+      }
+    }
+  } else {
+#pragma unroll
+    for (int c = 0; c < colorsPerThread; c++) {
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
+          targets[c * B_Y * imgPixels * numImages + i * B_X] =
+              scaleOutputs * prod[c][i];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * hidActs: (numFilters, numModules, numImages)
+ * filters: (numFilterColors, filterPixels, numFilters)             if conv,
+ *          (numModules, numFilterColors, filterPixels, numFilters) otherwise
+ * targets: (overSample, numImgColors, imgPixels, numImages)
+ *
+ * Note: all of these convolution routines are optimized for the case when
+ * the number of images (i.e. the minibatch size) is a multiple of 128.
+ * Other batch sizes will work, but I made no attempt whatsoever
+ * to make them work fast.
+ */
+void _imgActs(
+    caffe2::CUDAContext* context,
+    caffe2::TensorCUDA* hidActs,
+    caffe2::TensorCUDA* filters,
+    caffe2::TensorCUDA* targets,
+    int imgSizeY,
+    int imgSizeX,
+    int numModulesY,
+    int paddingStart,
+    int moduleStride,
+    int numImgColors,
+    int numGroups,
+    float scaleTargets,
+    float scaleOutput,
+    bool conv) {
+  CAFFE_ENFORCE(hidActs->ndim() == 2);
+  CAFFE_ENFORCE(filters->ndim() == 2);
+  CAFFE_ENFORCE(targets->ndim() == 2);
+
+  int numFilterColors = numImgColors / numGroups;
+  int numImages = hidActs->dim32(1);
+  int numFilters = filters->dim32(1);
+  int numModules = hidActs->dim32(0) / numFilters;
+  int filterModuleMult = conv ? 1 : numModules;
+  int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors);
+  int filterSize = sqrt(filterPixels);
+  int imgPixels = imgSizeY * imgSizeX;
+  int numModulesX = numModules / numModulesY;
+
+  CAFFE_ENFORCE(numImgColors % numGroups == 0);
+  CAFFE_ENFORCE(
+      numFilters % (16 * numGroups) ==
+      0); // TODO: insisting on 32 filters due to bug in calling code below. fix
+          // that.
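+  // Example (assuming conv == true and numGroups == 1): for 3-channel images,
+  // 5x5 filters and a 28x28 module grid, filters is (3 * 25, numFilters) and
+  // hidActs is (numFilters * 784, numImages), so the divisions above recover
+  // filterPixels = 25, filterSize = 5 and numModules = 784.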
+ CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0); + + CAFFE_ENFORCE(filterPixels == filterSize * filterSize); + CAFFE_ENFORCE(hidActs->dim32(0) == numModules * numFilters); + CAFFE_ENFORCE( + filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels); + CAFFE_ENFORCE(numModules == numModulesY * numModulesX); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + dim3 blocks; + dim3 threads; + int colorsPerThread, imgsPerThread; + if (numFilterColors % 8 == 0) { + threads = dim3(32, numFilterColors % 64 == 0 ? 8 : 4); + colorsPerThread = numFilterColors % 64 == 0 ? 8 + : numFilterColors % 48 == 0 + ? 12 + : numFilterColors % 32 == 0 ? 8 : numFilterColors % 16 == 0 ? 4 : 2; + imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + CAFFE_ENFORCE(numFilterColors % (threads.y * colorsPerThread) == 0); + + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread) * + (numImgColors / (threads.y * colorsPerThread)), + imgPixels); + // NOTE: the case when channels % 32 == 0 but channels % 48 != 0 and + // channels % 64 != 0 has not been optimized!! + } else if (numFilterColors > 3) { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2; + threads = dim3(16, 16); + colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2; + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread) * + (numImgColors / colorsPerThread), + DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4)); + } else { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
4 : 2; + threads = dim3(16, 16); + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread), + DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4)); + } + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + + if (scaleTargets == 0) { // do not scale or use targets matrix + targets->Resize(std::vector{numImgColors * imgPixels, numImages}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == numImgColors * imgPixels); + CAFFE_ENFORCE(targets->dim32(1) == numImages); + } + const bool scale = scaleTargets != 0; + + float* hidacts_data = hidActs->mutable_data(); + float* filters_data = filters->mutable_data(); + float* targets_data = targets->mutable_data(); + + cudaStream_t stream = context->cuda_stream(); + // cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + // 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); + // conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, + // 12, 16, 16, false, false, true ><<>>( + // tex_hidacts, tex_filters, targets_data, numModulesY, + // numModulesX, numImages, numFilters, filterSize, imgSizeY, + // imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, + // scaleTargets, scaleOutput); + + // return; + // printf("conv: %d\n", conv); + // printf("scale: %d\n", scale); + // printf("checkCaseBounds: %d\n", checkCaseBounds); + // printf("numFilterColors: %d\n", numFilterColors); + // printf("numImages: %d\n", numImages); + // cudaStream_t stream = NVMatrix::getDefaultStream(); + + if (conv == true) { + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 
scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + 
checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + 
hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + 
conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + 
imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + 
scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, 
+ numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + 
conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, true, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) 
{ + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + 
conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, 
+ 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, 
+ 32, + 4, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + 
filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + 
cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 
32, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, true, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } + } else if (conv == false) { + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + 
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + 
false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 
4, + 32, + 2, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if 
(numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + 
img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + false>, + 
cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, true, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + 
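+            // Texture-object path: tex_hidacts and tex_filters were created above via
+            // GetTensorTextureObject for the preloadfh kernel variant; they are
+            // destroyed below once the launch has been issued.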
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, 
+ 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + 
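+            // Only a multiple of 32 images remains here: dispatch the narrower
+            // per-thread image tiling (third template argument 1, versus 4 and 2
+            // in the 128- and 64-image branches above).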
cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + 
numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, 
+ targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + false>, + 
cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, true, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 
0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } + } + + getLastCudaError("imgActs: kernel execution failed"); +} + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1, + true); +} + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + true); +} + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1, + false); +} + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + false); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu new file mode 100644 index 0000000..b41a617 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu @@ -0,0 +1,6099 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../include/cudaconv2.cuh" + +#define LO16(x) ((x)&0x0000FFFF) +#define HI16(x) ((x) >> 16) + +#define WA_LOOP(r) \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \ + shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ + } + +#define WA_LOOP2(r) \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \ + shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ + } + +#define WA_IMLOAD(r) \ + imPreload[r] = im[(r)*B_X * B_Y / preloadCases * imgPixels * imgStride]; +#define WA_IMLOAD_TX(r) \ + imPreload[r] = tex1Dfetch( \ + images, \ + imgOffset2 + (r)*B_X * B_Y / preloadCases * imgPixels * imgStride); +#define WA_HALOAD(r) \ + haPreload[r] = ha[(r)*B_X * B_Y / preloadCases * numImages * numModules]; +#define WA_HALOAD_TX(r) \ + haPreload[r] = tex1Dfetch( \ + hidActs, \ + hidActsOffset2 + (r)*B_X * B_Y / preloadCases * numImages * numModules); + +__device__ __forceinline__ void +conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + const int my, + const int mx, + const int paddingStart, + const int numModulesX, + const int moduleStride, + const int blockPixelY, + const int blockPixelX, + const int imgSizeX, + const int imgStride, + int& pixIdx, + int& m) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + m = my * numModulesX + mx; +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. 
+ */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__global__ void conv_weight_acts_c_kepler( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int partialSum, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int filterBlocksPerModule = numFilters / (B_X * filtersPerThread); + const int outputModuleIdx = blockIdx.x / filterBlocksPerModule; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % filterBlocksPerModule); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += (outputModuleIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + //__shared__ bool isPxInImage[B_Y*pixelsPerThread]; + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + __syncthreads(); + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = + paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (m % numModulesX) * moduleStride; + int pxY = (imgLoadModPosY + (blockPixelOffset + tidx) / filterSize); + int pxX = (imgLoadModPosX + (blockPixelOffset + tidx) % filterSize); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? 
pixIdx + : -1; + // isPxInImage[tidx] = ; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (/*loadY < B_X*filtersPerThread &&*/ ( + !checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = + hidActs[caseIdx + y * numImages * numModules + m * numImages]; + } + } + } +#pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { +// if (loadY < B_Y * pixelCache) { // This condition is not necessary for +// correctness, but it speeds things a bit +/* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some imgGrads from filter pixels that don't exit (it'll set + * those to 0), but the code does not produce any output for those pixels (see + * last lines). + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + } + //} + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int p = 0; p < pixelCache; p++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + prod[c][pp + p][f] += + shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread + colors and B_X * 
filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this + routine will + * fail for filters >= 256*256. I'm assuming I won't ever use + such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__global__ void conv_weight_acts_mc_mf_kepler( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int partialSum, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int outputModuleIdx = blockIdx.x / numFilterBlocks; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += outputModuleIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + 
(m % numModulesX) * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) { + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the + // time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop + * the right number of times. + * + * This will load some images from filter pixels that don't exist + * (it'll set those to 0), but the code does not produce any output + * for those pixels (see last lines). + */ + if (loadY < B_Y * colorsPerThread) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = + images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } + } + } + + if (loadY < B_X * filtersPerThread) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = hidActs + [caseIdx + y * numImages * numModules + m * numImages]; + } + } + } + } else { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][f]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[c][f]; + } + } + } +} + +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread + colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y 
determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this + routine will + * fail for filters >= 256*256. I'm assuming I won't ever use + such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__global__ void conv_weight_acts_mc_mf_kepler_sw( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + 
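+  // Together with mStartX/mStartY above, the mEnd bounds below clamp the module
+  // loop to the modules of this chunk whose receptive field keeps this block's
+  // filter pixel (blockPixelX, blockPixelY) inside the image.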
const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + + /* + * Note; iterating this way is about 1% slower and uses a few more registers + * than iterating over the modules linearly. But it's consistent with the + * preload routines, so I'm using it. + */ + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + for (int mx = mStartX; mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = + (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the + // time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop + * the right number of times. + * + * This will load some images from filter pixels that don't exist + * (it'll set those to 0), but the code does not produce any output + * for those pixels (see last lines). 
+ */ + if (loadY < B_Y * colorsPerThread) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = + images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } + } + } + + if (loadY < B_X * filtersPerThread) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = hidActs + [caseIdx + y * numImages * numModules + m * numImages]; + } + } + } + } else { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][f]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[c][f]; + } + } + } +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. 
+ * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__global__ void conv_weight_acts_c_kepler_sw( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; 
mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + + __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + // const int imgLoadModPosY = paddingStart + my * + // moduleStride; const int imgLoadModPosX = paddingStart + // + mx * moduleStride; + int pxY = (imgLoadModPosY + fYOff); + int pxX = (imgLoadModPosX + fXOff); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (/*loadY < B_X*filtersPerThread &&*/ ( + !checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + const int fIdx = ((loadY + y) % filtersPerThread) * B_X + + (loadY + y) / filtersPerThread; + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + loadY + y < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = hidActs + [caseIdx + fIdx * numImages * numModules + m * numImages]; + } + } + } else { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // const int fIdx = ((loadY + y) % + // filtersPerThread) * B_X + (loadY + y) / + // filtersPerThread; + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + loadY + y < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = 0; + } + } + } +#pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { +// if (loadY < B_Y * pixelCache) { // This condition is not necessary for +// correctness, but it speeds things a bit +/* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some imgGrads from filter pixels that don't exit (it'll set + * those to 0), but the code does not produce any output for those pixels (see + * last lines). 
+ */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + } + //} + + __syncthreads(); + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int p = 0; p < pixelCache; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][pp + p][f] += + shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y] + [i] * + shHidActs[threadIdx.x * filtersPerThread + f][i]; + } + } + } + } + + __syncthreads(); + } + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +#define WA_C3_LOOP(pp, c) \ + _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + prod[c][(pp) + p][f] += \ + shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \ + shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ + } + +#define WA_C3_LOOP2(pp) \ + _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") for (int c = 0; c < 3; ++c) { \ + prod[c][(pp) + p][f] += \ + shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \ + shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ + } \ + } + +#define WA_3_FIDX(y) \ + (((loadY + (y)*B_X * B_Y / preloadCases) % filtersPerThread) * B_X + \ + (loadY + (y)*B_X * B_Y / preloadCases) / filtersPerThread) + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * 
filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +//__launch_bounds__(256,2) +__global__ void conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; + // images += loadX; + // hidActs += blockFilterIdx * numImages * numModules + // + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { 
+#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; + // if (!doWork) { + // hidActs -= + // } + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [8] + // if (blockIdx.x != 0 || blockIdx.y !=0) { + // return; + // } + // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, + // mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [8] + + int m = mStartY * numModulesX + mStartX; + + int fidx[filtersPerThread * preloadCases / B_Y]; + if (doWork) { +#pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + const int fIdx = WA_3_FIDX(y); + // if (doWork) { + haPreload[y] = tex1Dfetch( + hidActs, + hidActsOffset + fIdx * numImages * numModules + m * numImages); + // } + fidx[y] = fIdx * numImages * numModules; + } + } + + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = my * numModulesX + mx; + + // __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + // const int imgLoadModPosY = paddingStart + my * + // moduleStride; const int imgLoadModPosX = paddingStart + // + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + + int myNext = my, mxNext = mx, mNext = m; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + mNext = myNext * numModulesX + mxNext; + } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = &hidActs[caseIdx + + // preloadCases + m * numImages]; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + + if (lastBatch) { + // ha = &hidActs[mNext * numImages]; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY + y][loadX] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + +/* ================================================================================== + * Iteration 0 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + if (pxIdx + blockPixelOffset < filterPixels) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + WA_C3_LOOP(0, 0); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + WA_C3_LOOP(0, 1); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + WA_C3_LOOP(0, 2); + haPreload[6] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[6]); + haPreload[7] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[7]); + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + // if (threadIdx.x == 3) + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * 
Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__launch_bounds__(256, 2) __global__ + void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; + // images += loadX; + // hidActs += blockFilterIdx * numImages * numModules + // + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; 
f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [6] + // if (blockIdx.x != 0 || blockIdx.y !=0) { + // return; + // } + // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, + // mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [6] + + int m = mStartY * numModulesX + mStartX; + int fidx[filtersPerThread * preloadCases / B_Y]; + // if (doWork) { +#pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + fidx[y] = WA_3_FIDX(y) * numImages * numModules; + if (doWork) { // Not actually necessary, I think + haPreload[y] = + tex1Dfetch(hidActs, hidActsOffset + fidx[y] + m * numImages); + } + } + // } + int mNext = mStartY * numModulesX + mStartX; + for (int my = mStartY; my < mEndY; my++) { + // const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = mNext; // my * numModulesX + mx; + + // __syncthreads(); + // const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + mNext = lastModule * m + + !lastModule * + ((my + (mx + 1 == mEndX)) * numModulesX + + (mx + 1 == mEndX ? mStartX : mx + 1)); + // if (!lastModule) { + // const int mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + // const int myNext = my + (mx + 1 == mEndX); + // mNext = myNext * numModulesX + mxNext; + // } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = hidActs + !lastBatch * + // (caseIdx + preloadCases + m * numImages) + lastBatch * + // mNext * numImages; + const int hidActsOffset2 = hidActsOffset + + !lastBatch * (caseIdx + preloadCases + m * numImages) + + lastBatch * mNext * numImages; + // if (lastBatch) { + // ha = &hidActs[mNext * numImages]; + // } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY + y][loadX] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + +/* ================================================================================== + * Iteration 0 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + + WA_C3_LOOP2(0); + + __syncthreads(); + +/* ================================================================================== + * Iteration 1 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + // const int pxIdx = 2 * B_Y + loadY + y; // + // pixel idx in filter +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % 
(B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + WA_C3_LOOP2(2); + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. 
* + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(128, 4) __global__ + void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + // avoid bank conflict by reorganizing the data structure, and improve the + // band width by using 'float2' instead of 'float' + __shared__ float2 + shImages[preloadCases] + [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases] + [filtersPerThread * B_X / 2 + + 2]; // preload preloadCases cases of B_X hidacts + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int tidx = B_X * ty + tx; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + ty) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + 
max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + // reduce 2 registers + // float* shHidActLoad = &shHidActs[loadY][loadX]; + // float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [8] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, + mStartX, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdx, + m); + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // It's bizarre, but this is the fastest way I've found to get it not to + // load nonexistent pixels. All other ways cause crazy excessive register + // usage. + const int idx = (mStartY < mEndY && mStartX < mEndX) * + (0 + y * imgPixels * imgStride + pixIdx); + imPreload[y * preloadCases / (B_X * B_Y)] = + tex1Dfetch(images, imgOffset + idx); + } + } + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Almost certainly not necessary here. + const int idx = (mStartY < mEndY && mStartX < mEndX) * + (0 + y * numImages * numModules + m * numImages); + haPreload[y * preloadCases / (B_X * B_Y)] = + tex1Dfetch(hidActs, hidActsOffset + idx); + } + } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, + mxNext, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdxNext, + mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { +// store the preloaded image's pixel into shared memory +#pragma unroll + for (int y = 0; y < 4; y++) { + shImages[loadX][loadY + y * 8].x = imPreload[y]; + shImages[loadX][loadY + y * 8].y = imPreload[y + 4]; + } + // const float* im = &images[caseIdx + preloadCases + pixIdx]; + // const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + // store the images and hidActs + shHidActs[loadX][loadY].x = haPreload[0]; + shHidActs[loadX][loadY].y = haPreload[2]; + shHidActs[loadX][loadY + 16].x = haPreload[4]; + shHidActs[loadX][loadY + 16].y = haPreload[6]; + shHidActs[loadX][loadY + 8].x = haPreload[1]; + shHidActs[loadX][loadY + 8].y = haPreload[3]; + shHidActs[loadX][loadY + 24].x = haPreload[5]; + shHidActs[loadX][loadY + 24].y = haPreload[7]; + +// preloade the image's and hidAct's pixel +#pragma unroll + for (int r = 0; r < 8; r++) { + imPreload[r] = tex1Dfetch( + images, imgOffset2 + (r)*8 * imgPixels * imgStride); + haPreload[r] = tex1Dfetch( + hidActs, hidActsOffset2 + (r)*8 * numImages * numModules); + } + + __syncthreads(); +// put together the instructions of same type to improve instruction-level +// parallelism +#pragma unroll + for (int r = 0; r < 16; r++) { + for (int c = 0; c < 4; c++) { + prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x; + prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y; + prod[2][c] += + shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x; + prod[3][c] += + shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y; + prod[0][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x; + prod[1][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y; + prod[2][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x; + prod[3][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y; + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(256, 2) __global__ + void 
conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = 
&shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [6] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [16] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, + mStartX, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdx, + m); + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + imPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch( + images, imgOffset + y * imgPixels * imgStride + pixIdx); + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch( + hidActs, hidActsOffset + y * numImages * numModules + m * numImages); + } + } + // if (mStartY > mEndY || mStartX > mEndX) { + // printf("crzy!!\n"); + // } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, + mxNext, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdxNext, + mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + shImgLoad[(y)*preloadCases] = + imPreload[y * preloadCases / (B_X * B_Y)]; + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActLoad[y * (preloadCases + 1)] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + + __syncthreads(); + + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = &hidActs[caseIdx + + // preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + WA_LOOP(0); + WA_LOOP(1); + WA_LOOP(2); + WA_LOOP(3); + WA_LOOP(4); + + WA_LOOP(5); + WA_IMLOAD_TX(0); + WA_LOOP(6); + WA_IMLOAD_TX(1); + WA_LOOP(7); + WA_IMLOAD_TX(2); + WA_LOOP(8); + WA_IMLOAD_TX(3); + WA_LOOP(9); + WA_IMLOAD_TX(4); + WA_LOOP(10); + WA_IMLOAD_TX(5); + + WA_LOOP(11); + WA_HALOAD_TX(0); + WA_LOOP(12); + WA_HALOAD_TX(1); + WA_LOOP(13); + WA_HALOAD_TX(2); + WA_LOOP(14); + WA_HALOAD_TX(3); + WA_LOOP(15); + WA_HALOAD_TX(4); + WA_LOOP(16); + WA_HALOAD_TX(5); + WA_LOOP(17); + WA_HALOAD_TX(6); + WA_LOOP(18); + WA_HALOAD_TX(7); + WA_LOOP(19); + WA_HALOAD_TX(8); + WA_LOOP(20); + WA_HALOAD_TX(9); + WA_LOOP(21); + WA_HALOAD_TX(10); + WA_LOOP(22); + WA_HALOAD_TX(11); + WA_LOOP(23); + WA_HALOAD_TX(12); + WA_LOOP(24); + WA_HALOAD_TX(13); + WA_LOOP(25); + WA_HALOAD_TX(14); + WA_LOOP(26); + WA_HALOAD_TX(15); + + WA_LOOP(27); + WA_LOOP(28); + WA_LOOP(29); + WA_LOOP(30); + WA_LOOP(31); + + __syncthreads(); + } + } + } 
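+ // Write-back: when scale is true the accumulated products are blended into + // the existing weight gradients (the old value scaled by scaleTargets, plus + // prod scaled by scaleOutputs); otherwise targets is simply overwritten with + // prod scaled by scaleOutputs.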
+ + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(256, 2) __global__ + void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + // avoid bank conflict by re-organizing the data structure, and improve band + // width by using 'float2' instead of 'float' + __shared__ float2 + shImages[preloadCases] + [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases] + [filtersPerThread * B_X / 2 + + 2]; // preload preloadCases cases of B_X hidacts + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + // const int tidx = B_X * threadIdx.y + threadIdx.x; + // reduce two registers + // const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + // const int filterPixels = filterSize * filterSize; + // reduce one register + const int filterPixelsAll = numFilters * filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * 
numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = + (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride + + (ty * B_X + tx) % preloadCases; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + ((ty * B_X + tx) / preloadCases) * numImages * numModules + + ((ty * B_X + tx) % preloadCases); + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + // usie one temporary register instead of multiple registers + const int pIdxBase = imgStride * + ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX); + + targets += blockModuleChunkIdx * numFilters * filterSize * filterSize * + numFilterColors + + (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // reduce 3 registers + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + // float* shHidActLoad = &shHidActs[loadY][loadX]; + // float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [4] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + // int pixIdx, pixIdxNext, m, mNext; + + // conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + // mStartY, mStartX, paddingStart, numModulesX, moduleStride, + // blockPixelY, blockPixelX, imgSizeX, imgStride, + // pixIdx, m); + + const int pixIdx = + pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride; + const int m = (mStartY * numModulesX + mStartX); + + // preload the image's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) { +#pragma unroll + for (int i = 0; i < 4; i++) { + imPreload[i] = tex1Dfetch( + images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx); + } + } + + // preload the hidAct's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) { +#pragma unroll + for (int i = 0; i < 8; i++) { + haPreload[i] = tex1Dfetch( + hidActs, + hidActsOffset + 16 * i * numImages * numModules + m * numImages); + } + } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + for (int caseIdx = 0; caseIdx < numImages; caseIdx 
+= preloadCases) { + int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase + + (my * imgSizeX + mx) * moduleStride * imgStride; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + + (my * numModulesX + mx) * numImages; + + if (caseIdx + preloadCases == numImages) { + const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + const int myNext = my + (mx + 1 == mEndX); + + imgOffset2 = imgOffset + +pIdxBase + + (myNext * imgSizeX + mxNext) * moduleStride * imgStride; + hidActsOffset2 = + hidActsOffset + (myNext * numModulesX + mxNext) * numImages; + } + + if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) { + // store the previously preloaded pixel into shared memory + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .x = imPreload[0]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .y = imPreload[2]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .x = imPreload[1]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .y = imPreload[3]; + } + + if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) { + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .x = haPreload[0]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .y = haPreload[2]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 32] + .x = haPreload[4]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 32] + .y = haPreload[6]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .x = haPreload[1]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .y = haPreload[3]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 48] + .x = haPreload[5]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 48] + .y = haPreload[7]; + } + +#pragma unroll + for (int r = 0; r < 8; r++) { + haPreload[r] = tex1Dfetch<float>( + hidActs, hidActsOffset2 + r * 16 * numImages * numModules); + } + +#pragma unroll + for (int r = 0; r < 4; r++) { + imPreload[r] = tex1Dfetch<float>( + images, imgOffset2 + r * 16 * imgPixels * imgStride); + } + __syncthreads(); + +// put together the instructions of the same type to improve instruction-level +// parallelism; calculate the derivative of the hidAct with respect to the weight +#pragma unroll + for (int r = 0; r < 16; r++) { +#pragma unroll + for (int c = 0; c < 4; c++) { + prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x; + prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y; + prod[2][c] += + shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x; + prod[3][c] += + shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y; + prod[0][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x; + prod[1][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y; + prod[2][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x; + prod[3][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y; + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixelsAll + f * B_X] = + scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < 
colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixelsAll + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +std::pair<int, int> getWeightActsOutputSize( + int numModulesY, + int numModulesX, + int numFilterColors, + int filterSize, + int numFilters, + int sumWidth) { + const int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + const int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + const int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; + return std::pair<int, int>( + outputModuleChunks * numFilterColors * filterSize * filterSize, + numFilters); +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModules, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + * + * TODO: you can get a slight speed boost for local non-convolutional units by + * writing special routines for partialSum = 1. But I dunno if the code + * duplication is worth it... + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but I made no attempt whatsoever + * to make them work fast. + */ +void _weightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth, + float scaleTargets, + float scaleOutput) { + CAFFE_ENFORCE(images->ndim() == 2); + CAFFE_ENFORCE(hidActs->ndim() == 2); + CAFFE_ENFORCE(targets->ndim() == 2); + + int numFilterColors = numImgColors / numGroups; + int imgStride = images->dim32(1); + int numImages = images->dim32(1); + int imgPixels = images->dim32(0) / numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int numModules = numModulesY * numModulesX; + int numFilters = hidActs->dim32(0) / numModules; + int numFiltersPerGroup = numFilters / numGroups; + + CAFFE_ENFORCE(numImgColors % numGroups == 0); + CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0); + CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 16 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 16 == 0); + CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels); + CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors); + + int filterPixels = filterSize * filterSize; + int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; + // partialSum = partialSum == 0 ? 
numModules : partialSum; + + // CAFFE_ENFORCE(numModules % partialSum == 0); + CAFFE_ENFORCE(hidActs->dim32(1) == numImages); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + CAFFE_ENFORCE(numModules * numFilters == hidActs->dim32(0)); + + int preloadCases = 32; + + dim3 blocks, threads; + int bx, by; + int pixelsPerThread, filtersPerThread, colorsPerThread; + // Worth playing with these parameters to find best values for your problem. + // These values work relatively well, but not optimal for all problems. + if (numFilterColors > 3) { + filtersPerThread = + numFiltersPerGroup % 64 == 0 ? 4 : numFiltersPerGroup % 32 == 0 ? 2 : 1; + colorsPerThread = numFilterColors % 64 == 0 + ? 8 + : numFilterColors % 48 == 0 ? 6 : numFilterColors % 32 == 0 ? 8 : 4; + by = (numFilterColors / colorsPerThread) % 8 == 0 ? 8 : 4; + bx = numFiltersPerGroup % 128 == 0 ? 32 : 16; + preloadCases = filtersPerThread * colorsPerThread < 32 ? 32 : 16; + blocks = dim3( + outputModuleChunks * (numFilters / (bx * filtersPerThread)), + numFilterColors / (by * colorsPerThread), + filterPixels); + CAFFE_ENFORCE(numFilterColors % (by * colorsPerThread) == 0); + } else { // This is ugly but it's nice to spell it out clearly + CAFFE_ENFORCE(numGroups == 1); // Just for sanity + // NOTE: these things are only optimized for colors = 3. I didn't really + // test other cases. + if (numFilters % 64 == + 0) { // TODO: having a separate case for 128 would make things faster, + // but I probably don't care about 128 + filtersPerThread = 4; + pixelsPerThread = 2; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 48 == 0) { + filtersPerThread = 3; + pixelsPerThread = 4; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 32 == 0) { + filtersPerThread = 2; + pixelsPerThread = 2; + by = 8; + bx = 16; + preloadCases = 16; + } else { // This case is completely untested. It might be really slow. But + // no time now. 
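+      // For example, with numFilters == 16 and a 4x4 filter (filterPixels == 16),
+      // the settings below give blocks = dim3(outputModuleChunks * 1, 1) and
+      // threads = dim3(16, 16): one filter per thread, with a single block in the
+      // y dimension covering all 16 filter pixels.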
+ filtersPerThread = 1; + pixelsPerThread = 16; + by = 16; + bx = 16; + preloadCases = 32; + } + blocks = dim3( + outputModuleChunks * (numFilters / (bx * filtersPerThread)), + DIVUP(filterPixels, by * pixelsPerThread)); + } + CAFFE_ENFORCE((by * bx) % preloadCases == 0); + CAFFE_ENFORCE(numFilters % (bx * filtersPerThread) == 0); + threads = dim3(bx, by); + bool checkCaseBounds = numImages % preloadCases != 0; + bool scale = scaleTargets != 0; + std::pair targetSize = getWeightActsOutputSize( + numModulesY, + numModulesX, + numFilterColors, + filterSize, + numFilters, + sumWidth); + if (!scale) { + targets->Resize(std::vector{targetSize.first, targetSize.second}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == targetSize.first); + CAFFE_ENFORCE(targets->dim32(1) == targetSize.second); + } + + float* images_data = images->mutable_data(); + float* hidacts_data = hidActs->mutable_data(); + float* targets_data = targets->mutable_data(); + const std::size_t images_bytes = images->nbytes(); + + cudaStream_t stream = context->cuda_stream(); + + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + 
conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + 
paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 
16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + 
numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false> + <<>>( + images_data, + 
hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + 
numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, 
+ imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, 
+ filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + 
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, 
+ paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + 
numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + 
numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + 
imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + 
numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); 
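// --- Editorial sketch (not part of the original diff) -----------------------
// Every branch of the dispatch above repeats one pattern: a runtime
// divisibility test on numFiltersPerGroup selects a compile-time tile
// configuration of conv_weight_acts_c_kepler_sw, the chosen instantiation is
// given a shared-memory cache preference, and it is then launched. The
// <<<...>>> launch arguments appear to have been stripped from this diff text;
// `blocks`, `threads` and `stream` below are assumed to stand for the
// grid/block/stream values computed earlier in the host function.
if (numFiltersPerGroup % 64 == 0) {
  cudaFuncSetCacheConfig(
      conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true>,
      cudaFuncCachePreferShared);
  conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true>
      <<<blocks, threads, 0, stream>>>(
          images_data, hidacts_data, targets_data,
          numImages, numFilters, numModulesY, numModulesX,
          imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride,
          imgStride, sumWidth, scaleTargets, scaleOutput);
}
// ---------------------------------------------------------------------------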
+ } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + 
cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } + } + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + getLastCudaError("weightActs: kernel execution failed"); +} + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int partialSum) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + partialSum, + 0, + 1); +} + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int partialSum, + float scaleTargets, + float scaleOutput) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + partialSum, + scaleTargets, + scaleOutput); +} + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 1, + 0, + 1); +} + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 1, + scaleTargets, + scaleOutput); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile b/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile new file mode 100644 index 0000000..2e1c1e7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile @@ -0,0 +1,112 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. 
NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) +MODELNAME := _ConvNet +LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3 +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH) + +DEFINES := -DNUMPY_INTERFACE + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . -name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/$(MODELNAME).so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS) + ln -sf $(TARGET) . 
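# --- Editorial note (not part of the original diff) --------------------------
# Typical invocations of this Makefile. The environment variables are the ones
# the Makefile already references; the example paths are illustrative only.
#
#   export CUDA_INSTALL_PATH=/usr/local/cuda
#   export CUDA_SDK_PATH=/usr/local/cuda/samples
#   export PYTHON_INCLUDE_PATH=/usr/include/python2.7
#   export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include
#
#   make            # release build (-O3), produces ./bin/release/_ConvNet.so
#   make dbg=1      # debug build (-g -G), produces ./bin/debug/_ConvNet.so
#   make prof=1     # adds --ptxas-options=-v for register/occupancy reporting
# -----------------------------------------------------------------------------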
+ +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/__init__.py b/caffe2/contrib/cuda-convnet2/cudaconvnet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh new file mode 100644 index 0000000..58e34a5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh @@ -0,0 +1,66 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ACTBROADCASTER_CUH_H_ +#define ACTBROADCASTER_CUH_H_ + +#include +#include "streambroadcast.cuh" +#include "copypipeline.cuh" + +class BroadcastMessage { +public: + enum MESSAGE_TYPE { + BROADCAST, + EXIT + }; +protected: + int _srcDevice; + std::map _mats; + int _userIdx; + Queue* _finishQueue; + MESSAGE_TYPE _type; + BroadcastMessage(MESSAGE_TYPE type); +public: + BroadcastMessage(std::map mats, int srcDevice, int userIdx, Queue& finishQueue); + + int getSrcDevice(); + std::map& getMatrices(); + int getUserIdx(); + Queue& getFinishQueue(); + MESSAGE_TYPE getMessageType(); +}; + +class ExitBroadcastMessage : public BroadcastMessage { +public: + ExitBroadcastMessage(); +}; + +class ActBroadcaster : public Thread { +protected: + std::map _broadcasters; // src device --> broadcaster + Queue _messageQueue; + int _numUsers; +public: + ActBroadcaster(int numUsers, intv& cpus); + ~ActBroadcaster(); + Queue& getMessageQueue(); + virtual void* run(); + void stop(); +}; + + +#endif /* ACTBROADCASTER_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh new file mode 100644 index 0000000..230a721 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh @@ -0,0 +1,180 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CONVNET3 +#define CONVNET3 + +#include +#include +#include +#include +#include +#include +#include "../../util/include/queue.h" +#include "../../util/include/thread.h" +#include +#include "../../util/include/sync.h" +#include "messages.cuh" +#include "streambroadcast.cuh" + +#include "layer.cuh" +#include "data.cuh" +#include "worker.cuh" +#include "weights.cuh" +#include "pipedispenser.cuh" +#include "timer.cuh" + +class Worker; +class WorkResult; +class Layer; +class DataLayer; +class CostLayer; +class ConvNetThread; +class StreamBroadcast; +class Weights; + +// name -> device id -> layer* +typedef std::map > NameReplicaLayerMap; +typedef std::map NameLayerMap; +// name -> ReplicaMap +//typedef std::map ReplicaNameLayerMap; +typedef std::vector ConvNetThreadV; +typedef std::vector DataLayerVector; +//typedef std::map ReplicaThreadsMap; + +class ConvNet : public Thread { +private: + void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights); +protected: + NameReplicaLayerMap _layerMap; + DataLayerVector _dataLayers; + // Vector of convnet threads (one thread == one GPU) + ConvNetThreadV _convNetThreads; + + DataProvider* _dp; + CPUData* _data, *_bufferData; + int _bufferMinibatchIdx, _bufferPassIdx; + ThreadSynchronizer* _sync; + intv _deviceIDs; + + Queue _workerQueue; + Queue _resultQueue; + Queue _msgQueue; + + int _numFwdTerminal; + std::map _numBwdTerminal; // pass idx -> #terminal + int _totalPassesDone; + int _numReplicasMin, _numReplicasMax; + // For gradient checking + int _numFailures; + int _numTests; + + // Training progress (between 0 and 1). + // Used to determine learning rate based on ParameterSchedule. + double _trainingProgress; + double _baseErr; + bool _conserveMem; + PipeDispenser *_dataCopyPD; + + void waitForTerminals(int numMsgs, MESSAGES msg); + void sendMessage(MESSAGES msg, bool sync); + void sendMessage(Message* msg, bool sync); + void findBwdTerminal(Layer& l, std::set& visited, int& terminal, int passIdx); + void connectReplicas(); + void initDataLayers(PyObjectV* layerList); + void initGPUThreads(PyObjectV* layerList); + void connectChildren(PyObject* layerParams); + void* run(); + void setData(CPUData& data, int passIdx); + void setDataFromBuffer(); + void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx); +public: + ConvNet(PyObject* layerParams, intv& deviceIDs, + int minibatchSize, bool conserveMem); + ~ConvNet(); + void stop(); + + Queue& getMessageQueue(); + Queue& getWorkerQueue(); + Queue& getResultQueue(); + DataProvider& getDataProvider(); + + Layer& getLayer(std::string& name, int replicaID); + void copyToCPU(); + void copyToGPU(); + void updateWeights(int passIdx); + void reset(int passIdx); + void reset(); + + void bprop(int passIdx, PASS_TYPE passType); + void fprop(int miniIdx, int passIdx, PASS_TYPE passType); + void fprop(CPUData& data, int passIdx, PASS_TYPE passType); + + void setTrainingProgress(double progress); + double getTrainingProgress() const; + + bool checkGradient(const std::string& name, float eps, Weights& weights); + void checkGradients(); + Cost& getCost(); + Cost& getCost(Cost& cost); + CPUData& getData(); // Returns last minibatch fpropped + double getCostValue(); + intv& getDeviceIDs(); + ThreadSynchronizer& getSync(); + void syncWithChildren(); + int getMinibatchSize(); + bool isConserveMemory(); + int getNumReplicasMax(); + int getNumReplicasMin(); + int getNumPasses(); + int getTotalPassesDone(); + PipeDispenser& getDataCopyPD(); +}; + +class ConvNetThread : public 
Thread { +protected: + NameLayerMap _nameLayerMap; + std::vector _costs; + ConvNet* _convNet; + int _deviceID; + Queue _msgQueue; + Timer _timer; +// StreamBroadcast* _weightSynchronizer; + + void initCuda(); + virtual void initLayer(PyObject* paramsDict, int replicaID); + void* run(); +public: + ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet); + ~ConvNetThread(); + + NameLayerMap& getLayerMap(); + int getDeviceID(); + + ConvNet& getConvNet(); + + Queue& getMessageQueue(); + std::vector& getCostLayers(); +// StreamBroadcast& getWeightSynchronizer(); + + Cost& getCost(); + Layer& getLayer(std::string& name); + void startTimer(); + double stopTimer(); +}; + +#endif /* CONVNET */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh new file mode 100644 index 0000000..f9dfa81 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh @@ -0,0 +1,218 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COPYPIPELINE_CUH_ +#define COPYPIPELINE_CUH_ + +#include +#include "../../util/include/thread.h" +#include "../../util/include/queue.h" +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k +#define COPY_MAX_CHUNKS 16 +#define COPY_MIN_CHUNKS 2 + +class CopyPeer; +class CopySource; +class ICopySegment; +class IBroadcastNetwork; + +class CopyMessage { +protected: + std::map* _mats; + float _scaleSource, _scaleTargets; +public: + enum COPY_MESSAGE_TYPE { + COPY_CHUNK, + COPY_START, + EXIT + }; + CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map& mats) + : _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) { + } + CopyMessage(COPY_MESSAGE_TYPE msgType) + : _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) { + } + inline COPY_MESSAGE_TYPE getType() const { + return _msgType; + } + inline NVMatrix& getMatrix(int deviceID) const { + return *_mats->at(deviceID); + } + inline std::map& getMatrices() const { + return *_mats; + } + inline float getScaleSource() const { + return _scaleSource; + } + inline float getScaleTargets() const { + return _scaleTargets; + } +protected: + COPY_MESSAGE_TYPE _msgType; +}; + +class CopyChunkMessage : public CopyMessage { +protected: + int _chunkIdx; + int _chunkSize; + int _numChunks; +public: + CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map& mats) + : _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) { + } + + inline int getChunkIdx() const { + return _chunkIdx; + } + inline int getChunkSize() const { + return _chunkSize; + } + inline int getNumChunks() const { + return _numChunks; + } +}; + +class CopyStartMessage : public CopyMessage { +public: 
+ CopyStartMessage(float scaleSource, float scaleTargets, std::map& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) { + } +}; + +class ICopySegment : public Thread { +protected: + int _deviceID, _execDeviceID; + cudaStream_t _stream; + ICopySegment* _prev; + std::vector _next; + Queue _queue; + Queue* _finishQueue; + HostNVMatrix _hmat; + IBroadcastNetwork* _parent; + + NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx); + void* run(); + virtual bool processMessage(CopyMessage& msg) = 0; + +public: + ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); + virtual ~ICopySegment(); + inline NVMatrix& getMatrix(CopyMessage& msg); + Queue& getQueue(); + inline int getDeviceID(); + void addPrev(ICopySegment& c); + void addNext(CopyPeer& c); + bool isTerminal() const; + virtual bool isSource() const = 0; +}; + +class CopySource : public ICopySegment { +protected: + bool processMessage(CopyMessage& msg); +public: + CopySource(IBroadcastNetwork& parent, int deviceID); + inline bool isSource() const; +}; + +class CopyPeer : public ICopySegment { +protected: + bool processMessage(CopyMessage& msg); +public: + CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); + inline bool isSource() const; +}; + +class IBroadcastNetwork { +protected: + Queue _finishQueue; + CopySource* _src; + std::vector _peers; + int _srcDeviceID, _numTerminal; + bool _constructed; + std::set _devices; + std::pair,std::vector > makeGPULists(); + + void makePeers(std::pair,std::vector >& gpus); + virtual void makeConnections() = 0; + virtual void _broadcast(std::map& mats, float scaleSource, float scaleTargets); + IBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); +public: + virtual IBroadcastNetwork& construct(); + virtual ~IBroadcastNetwork(); + + virtual void broadcast(std::map& mats); + int getSourceDeviceID() const; + static IBroadcastNetwork& make(std::set devices, int srcDeviceID); +}; + +class ISafeBroadcastNetwork : public IBroadcastNetwork { +protected: + ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); +public: + virtual void broadcast(std::map& mats, float scaleSource, float scaleTargets); + virtual ISafeBroadcastNetwork& construct(); + static ISafeBroadcastNetwork& make(std::set devices, int srcDeviceID); +}; + +class NullBroadcaster : public ISafeBroadcastNetwork { +protected: + NullBroadcaster(std::set& devices, int srcDeviceID); + void makeConnections(); +public: + NullBroadcaster& construct(); + void broadcast(std::map& mats, float scaleSource, float scaleTargets); + void broadcast(std::map& mats); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +/* + * This one goes to host and then to targets. 
+ */ +class NaiveBroadcaster : public ISafeBroadcastNetwork { +protected: + NaiveBroadcaster(std::set& devices, int srcDeviceID); + void makeConnections(); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +class EightGPUBroadcaster1 : public IBroadcastNetwork { +protected: + EightGPUBroadcaster1(std::set& devices, int srcDeviceID); + void makeConnections(); + friend class IBroadcastNetwork; +}; + +class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork { +protected: + int _tgtDeviceID; + cudaStream_t _tgtStream; + void makeConnections(); + void resetDeviceID(int d); + void _broadcast(std::map& mats, float scaleSource, float scaleTargets); +public: + TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID); + ~TwoPeeringGPUsBroadcaster(); + ISafeBroadcastNetwork& construct(); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +#endif /* COPYPIPELINE_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh new file mode 100644 index 0000000..80270e3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh @@ -0,0 +1,56 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COST_CUH +#define COST_CUH + +#include +#include +#include + +#include "layer.cuh" +#include "util.cuh" + +class CostLayer; + +/* + * Wrapper for dictionary mapping cost name to vector of returned values. + */ +class Cost { +protected: + std::map _numCases; + CostMap _costMap; + CostCoeffMap _costCoeffMap; + std::map& getNumCasesMap(); +public: + Cost(); + Cost(std::vector& costs); + doublev& operator [](const std::string s); + CostMap& getCostMap(); + CostCoeffMap& getCostCoeffMap(); + int getNumCases(); + /* + * Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients. + */ + double getValue(); + Cost& operator += (Cost& er); + virtual ~Cost(); + void print(); +}; + + +#endif /* COST_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh new file mode 100644 index 0000000..e64601f --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh @@ -0,0 +1,101 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATA_CUH +#define DATA_CUH + +#include +#include +#include "util.cuh" + +class CPUData { +protected: + MatrixV* _data; + void assertDimensions() { + assert(_data->size() > 0); + for (int i = 1; i < _data->size(); i++) { + assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols()); + if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) { + _data->at(i)->setTrans(_data->at(i-1)->isTrans()); + } + assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans()); + } + assert(_data->at(0)->getNumCols() > 0); + } +public: + typedef typename MatrixV::iterator T_iter; + // Cases in columns, but array may be transposed + // (so in memory they can really be in rows -- in which case the array is transposed + // during the copy to GPU). + CPUData(PyObject* pyData) { + _data = getMatrixV(pyData); + assertDimensions(); + } + + CPUData(MatrixV* data) : _data(data) { + assertDimensions(); + } + + ~CPUData() { + for (T_iter it = _data->begin(); it != _data->end(); ++it) { + delete *it; + } + delete _data; + } + + Matrix& operator [](int idx) const { + return *_data->at(idx); + } + + int getSize() const { + return _data->size(); + } + + MatrixV& getData() const { + return *_data; + } + + Matrix& getData(int i) const { + return *_data->at(i); + } + + bool isTrans() const { + return _data->at(0)->isTrans(); + } + + int getNumCases() const { + return _data->at(0)->getNumCols(); + } +}; + +class DataProvider { +protected: + CPUData* _hData; + NVMatrixV _data; + int _minibatchSize; +public: + DataProvider(int minibatchSize); + void setData(CPUData&); + void clearData(); + CPUData& getMinibatch(int idx); + CPUData& getDataSlice(int startCase, int endCase); + int getNumMinibatches(); + int getMinibatchSize(); + int getNumCases(); +}; + +#endif /* DATA_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh new file mode 100644 index 0000000..84079ae --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GRADREDUCER_CUH_ +#define GRADREDUCER_CUH_ + +#include +#include +#include "streambroadcast.cuh" +#include "reducepipeline.cuh" +#include "layer.cuh" +#include "util.cuh" + +class StreamBroadcast; +class Layer; + +#define ACT_GRAD_REDUCER_EXIT (1 << 16) + +//class ReduceMessage { +// ReduceMessage(); +// ReduceMessage(bool exit); +//}; + +class IActGradReducer : public Thread { +protected: + Layer* _parent; + Queue _finishQueue; + int _numExpectedMsgsTotal; + std::map _numExpectedMsgs; // map from device id -> num expected msgs + + void* run(); + virtual bool reduce() = 0; + virtual void reset() = 0; +public: + IActGradReducer(Layer& parent, std::map numExpectedMsgs); + virtual ~IActGradReducer(); + int waitForFinish(); + virtual void enqueueReduction(int deviceID) = 0; + virtual void stop() = 0; + static IActGradReducer& makeGradReducer(Layer& parent, std::map numExpectedMsgs); +}; + +class SequentialActGradReducer : public IActGradReducer { +protected: + + std::map _numReceivedMsgs; // map from device id -> num received msgs + + std::map* > _messageQueues; + intv _deviceIDs; + StreamBroadcast* _broadcaster; + bool reduce(); + void reset(); +public: + SequentialActGradReducer(Layer& parent, std::map numExpectedMsgs); + ~SequentialActGradReducer(); + void enqueueReduction(int deviceID); + void stop(); +}; + +class ParallelActGradReducer : public IActGradReducer { +protected: + IEightGPUReducer* _reducer; + int _numReceivedMsgs; + float _scaleTarget; + Queue _messageQueue; + bool reduce(); + void reset(); +public: + ParallelActGradReducer(Layer& parent, std::map numExpectedMsgs); + void enqueueReduction(int deviceID); + void stop(); +}; + + +#endif /* GRADREDUCER_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h new file mode 100644 index 0000000..83c5061 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h @@ -0,0 +1,61 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef JPEG_MAIN_H +#define JPEG_MAIN_H + +#include +#include +#include +#include +#include +#include +#include +//#include +#include "../../util/include/thread.h" +#include "../../util/include/matrix.h" + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define NUM_JPEG_DECODER_THREADS 4 + + +class DecoderThread : public Thread { + protected: + PyObject* _pyList; + Matrix* _target; + int64 _start_img, _end_img; + int64 _img_size, _inner_size, _inner_pixels; + bool _test, _multiview; + + unsigned char* _decodeTarget; + int64 _decodeTargetSize; + unsigned int _rseed; + + void* run(); + void decodeJpeg(int idx, int& width, int& height); + double randUniform(); + double randUniform(double min, double max); + void crop(int64 i, int64 width, int64 height, bool flip); + virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y); + public: + DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview); + virtual ~DecoderThread(); +}; + +#endif // JPEG_MAIN_H diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh new file mode 100644 index 0000000..2400413 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh @@ -0,0 +1,812 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LAYER_CUH +#define LAYER_CUH + +#include +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh" + +#include "weights.cuh" +#include "convnet.cuh" +#include "cost.cuh" +#include "neuron.cuh" +#include "data.cuh" +#include "layer_kernels.cuh" +#include "streambroadcast.cuh" +#include "actbroadcaster.cuh" +#include "gradreducer.cuh" +#include "util.cuh" +#include "timer.cuh" +#include "memorysource.cuh" + +class Cost; +class ConvNet; +class ConvNetThread; +class CostLayer; +class DataLayer; +class Layer; +class ActBroadcaster; +class BroadcastMessage; +class IActGradReducer; +class Weights; +class WeightList; +typedef std::vector LayerV; + +class BinomialCrossEntOperator { +protected: + float _posWeight; +public: + BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y); + } +}; + +class CrossEntOperator { +protected: + float _posWeight; +public: + CrossEntOperator(float posWeight) : _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _posWeight * t * safelog(y); + } +}; + +/* + * Abstract layer. 
+ */ +class Layer { +protected: + ConvNetThread* _convNetThread; + + // This is a vector[#layers_next] + std::vector _next; + // This is a vector[#replicas_prev][#layers_prev] + std::map > _prev; + + int _rcvdFInputMsgs; + std::map _numComputedActsGrads; + int _rcvdBInputMsgs; + int _numOutputs; + std::map _inputs; // input idx -> matrix + std::map _memSrcActs; // device id -> memory source + std::map _memSrcActsGrad; // device id -> memory source + + bool _gradConsumer, _foundGradConsumers, _trans; + std::map _bwdTerminal; // One bool per pass + int _numGradProducersNext; + int _actsTarget, _actsGradTarget; + std::string _name, _type; + intv _nextDeviceIDs, _prevDeviceIDs; + HostNVMatrix _hostMemFwd; + + // New replica-related stuff: + std::map _replicas; // NOTE: a layer is its own sibling, too + // Previous layers sorted by device ID, in reverse order in which they are procesed by + // sequential grad reducer. map from replica -> device id -> layers + std::map > > _prevByDevice; + std::map _inputIndices; + int _replicaID; + int _numReplicas; + int _numReplicasPrev, _numReplicasNext; + + Queue _broadcastFinishQueue; + Queue _reductionFinishQueue; + ActBroadcaster* _actBroadcaster; + IActGradReducer* _gradReducer; + Timer _timer; + bool _initialized; + + virtual void fpropNext(PASS_TYPE passType, int passIdx); + virtual void truncBwdActs(); + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0; + + virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) { + // Do nothing by default + } + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(!isGradProducer()); // Only do nothing if not grad producer + } + virtual void fpropCommon(PASS_TYPE passType) { + + } + void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx); + + ActBroadcaster& getActBroadcaster(); + IActGradReducer& getGradReducer(); + int getInputIdx(std::string& parentName); + void setInputIdx(std::string& parentName, int idx); + +public: + static bool _saveActsGrad, _saveActs; + + Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + virtual ~Layer(); + + virtual bool fprop(PASS_TYPE passType, int passIdx); + void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx); + virtual void fprop(std::map& v, PASS_TYPE passType, int passIdx); + virtual void bprop(PASS_TYPE passType, int passIdx); + virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); + virtual void reset(); + virtual void resetPassIdx(); + int getNumCases(NVMatrix& v); + int& getNumComputedActsGrads(int deviceID); + int incRcvdBInputMsgs(); + bool isGradConsumer(); + bool hasGradProducerNext(std::string& layerName); + // Does this layer produce a gradient for any layer? + virtual bool isGradProducer(); + // Does this layer produce a gradient for layer of given name? 
+ virtual bool isGradProducer(std::string& layerName); + std::string& getName(); + std::string& getType(); + virtual void addNext(Layer& l); + virtual void addPrev(Layer& l, int replicaIdx); + virtual void addReplica(Layer& l); + std::map >& getPrev(); + std::vector& getNext(); + virtual NVMatrix& getActs(); + virtual NVMatrix& getActs(int deviceID); + virtual NVMatrix& getActs(int deviceID, int numCases); + virtual NVMatrix& getActsGrad(); + virtual NVMatrix& getActsGrad(int deviceID); + virtual std::map getAllActs(); + virtual std::map getAllActsGrads(); + virtual bool postInit(); + int getDeviceID(); + ConvNetThread& getConvNetThread(); + cudaStream_t getStream(); + void syncStream(); + void setBwdTerminal(int passIdx); + // Do nothing if this layer has no weights + virtual bool updateWeights() { + return false; + } + virtual bool constrainWeights() { + return false; + } + virtual void checkGradient() { + } + virtual void copyToCPU() { + } + virtual void copyToGPU() { + } + intv& getNextDeviceIDs() { + return _nextDeviceIDs; + } + + int getReplicaID(); + int getNumReplicas(); + int getNumSiblingReplicas(); + int getNumReplicasPrev(); + int getNumReplicasNext(); + int getNumOutputs(); + void setMemorySourceActs(int deviceID, MemoryView& mem); + void setMemorySourceActsGrad(int deviceID, MemoryView& mem); + MemoryView& getMemorySourceActs(int deviceID); + MemoryView& getMemorySourceActsGrad(int deviceID); + int getFwdActiveInputReplicaIdx(int passIdx); + int getBwdActiveInputReplicaIdx(int passIdx); + int getFwdActiveReplicaIdx(int passIdx); + int getNumLayersPrev(); + virtual int getNumInputReplicas(); + int getNumExpectedBwdMsgs(); + int getNumExpectedFwdMsgs(); + int getReplicaIdx(); + int getActivePassPeriod(); + int getNumGradProducersNext(); + virtual ConvNet& getConvNet(); +}; + +class TwoDLayerInterface { +protected: + int _channels, _imgSize, _imgPixels; +public: + TwoDLayerInterface(PyObject* paramsDict); +}; + +class NeuronLayer : public Layer { +protected: + Neuron* _neuron; + std::string _neuronType; + + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + class CrossEntLogisticGradientOperator { + private: + float _coeff, _posWeight; + public: + CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { + } + __device__ inline float operator()(const float y, const float t) const { + return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y); + } + }; + NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~NeuronLayer(); + std::string& getNeuronType(); +}; + +class WeightLayer : public Layer { +protected: + WeightList* _weights; + Weights *_biases; + NVMatrix _norm2; + float _wStep, _bStep; + int _weightUpdatePassPeriod; + void fpropCommon(PASS_TYPE passType); + void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType); + virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0; + virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0; + virtual void _constrainWeights(); + virtual float getGradScale(int inpIdx, PASS_TYPE passType); + virtual float getIncScale(int inpIdx, PASS_TYPE passType); + virtual float getBGradScale(PASS_TYPE passType); + virtual float getBIncScale(); + virtual NVMatrix& 
getGradTarget(int inpIdx); + NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx); + NVMatrix& getBiasMatrix(PASS_TYPE passType); +public: + WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad); + virtual ~WeightLayer(); + virtual bool updateWeights(); + virtual bool constrainWeights(); + virtual void copyToCPU(); + virtual void copyToGPU(); + virtual void checkGradient(); + Weights& getWeights(int idx); + void addReplica(Layer& l); + virtual bool postInit(); +}; + +class FCLayer : public WeightLayer { +protected: + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType); + virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + virtual void _constrainWeights(); +public: + FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); + FCLayer(); +}; + +class SplitFCLayer : public FCLayer { +protected: + int _numParts; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +// void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void splitWeights(); +public: + SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); +}; + +class SoftmaxLayer : public Layer { +protected: + bool _doUpperGrad; + NVMatrix _max, _sum; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + void setDoUpperGrad(bool b); +}; + +class ConcatenationLayer : public Layer { +protected: + intv* _copyOffsets; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual ~ConcatenationLayer(); +}; + +class PassThroughLayer : public Layer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual bool postInit(); +}; + +class EltwiseSumLayer : public Layer { +protected: + floatv* _coeffs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~EltwiseSumLayer(); +}; + +class EltwiseMaxLayer : public Layer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class SumLayer : public Layer { +protected: + int _stride; + void 
fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class DataCopyMessage { +public: + enum MESSAGE_TYPE { + COPY, + EXIT + }; +protected: + CPUData* _cpuData; + int _passIdx; + bool _other; + DataCopyMessage::MESSAGE_TYPE _type; + DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) { + } +public: + DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) { + } + + CPUData& getData() const { + return *_cpuData; + } + + int getPassIdx() const { + return _passIdx; + } + + bool isOther() const { + return _other; + } + + DataCopyMessage::MESSAGE_TYPE getType() { + return _type; + } +}; + +class DataCopyExitMessage : public DataCopyMessage { +public: + DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) { + } +}; + +class DataCopyThread; + +class DataLayer : public Layer { +protected: + bool _useBuffer; + int _dataIdx; + ConvNet* _convNet; +// std::map _outputs2; // Buffer for copying data during computation + std::map _memSrcActs2; // // Buffer for copying data during computation + std::map _copyStreams; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + Queue _copyFinishQueue; + DataCopyThread* _copier; + bool _outstandingCopyRequest; + int _start, _end; + +public: + void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer); + DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID); + ~DataLayer(); + NVMatrix& getActs(int deviceID); +// NVMatrix& getActs(int deviceID, bool other); + NVMatrix& getActs(int deviceID, bool other, int numCases); + bool isGradProducer(); + void toggleBuffer(int passIdx); + void copyData(CPUData& data, bool other, int passIdx); + bool postInit(); + ConvNet& getConvNet(); + int getNumInputReplicas(); + cudaStream_t getCopyStream(int deviceID); + Queue& getCopyFinishQueue() { + return _copyFinishQueue; + } + void waitForCopyFinish(); + int getDataIdx() const { + return _dataIdx; + } + int getStart() const { + return _start; + } + int getEnd() const { + return _end; + } +}; + + +class DataCopyThread : public Thread { +protected: + DataLayer* _parent; + Queue _queue; + HostNVMatrix _hostMemFwd; + Timer _requestTimer; + int _sleepUsec; + virtual void* run(); + +public: + DataCopyThread(DataLayer& parent, intv& cpus); + Queue& getQueue(); + void stop(); +}; + + +class LocalLayer : public WeightLayer { +protected: + intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups; + intv* _imgPixels, *_filterPixels, *_filterChannels; + int _modulesX, _modules, _numFilters; + +public: + LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); + virtual ~LocalLayer(); +}; + +class ConvLayer : public LocalLayer { +protected: + int _sumWidth; + bool _sharedBiases; + floatv* _weightContrastNormMin, *_weightContrastNormMax; + NVMatrix _weightGradTmp; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void truncBwdActs(); + void _constrainWeights(); + +public: 
+ ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual ~ConvLayer(); +}; + +class LocalUnsharedLayer : public LocalLayer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void _constrainWeights(); +public: + LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class PoolLayer : public Layer, public TwoDLayerInterface { +protected: + int _sizeX, _start, _stride, _outputsX; + std::string _pool; +public: + PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + + static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class AvgPoolLayer : public PoolLayer { +protected: + bool _sum; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class MaxPoolLayer : public PoolLayer { +protected: + bool _abs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs); +}; + +class CrossMapPoolLayer : public Layer, public TwoDLayerInterface { +protected: + int _size, _start, _stride, _outputs; + std::string _pool; +public: + CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + + static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CrossMapMaxPoolLayer : public CrossMapPoolLayer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RandomScaleLayer : public Layer, public TwoDLayerInterface { +protected: + int _tgtSize, _minScaledSize; + float _maxScale; // should be >= 1 + NVMatrix _rescaledActs; + std::vector _scaleProbs; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CropLayer : public Layer, public TwoDLayerInterface { +protected: + int _tgtSize, _startX, _startY; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class NailbedLayer : public Layer, public TwoDLayerInterface { +protected: + int _start, _stride, _outputsX; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + 
NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class GaussianBlurLayer : public Layer, public TwoDLayerInterface { +protected: + Matrix* _hFilter; + NVMatrix _filter; + NVMatrix _actGradsTmp; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void copyToGPU(); + + GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~GaussianBlurLayer(); +}; + +class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface { +protected: +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID); +}; + +class ResizeLayer : public Layer, public TwoDLayerInterface { +protected: + float _scale; + int _tgtSize; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class DropoutLayer : public Layer { +protected: + bool _enable; + float _keep; + NVMatrix _keepMask; +public: + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); + DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + class DropoutSmallerThanOperator { + private: + float _keep, _scale; + public: + DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) { + } + __device__ inline float operator()(const float x) const { + return (x < _keep) * _scale; + } + }; +}; + +class Dropout2Layer : public DropoutLayer { +protected: +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RGBToYUVLayer : public Layer { +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RGBToLABLayer : public Layer { +protected: + bool _center; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class ResponseNormLayer : public Layer, public TwoDLayerInterface { +protected: + int _size; + float _scale, _pow; + float _minDiv; + NVMatrix _denoms; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); +public: + ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CrossMapResponseNormLayer : public ResponseNormLayer { 
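+    // Same scheme as ResponseNormLayer, except that the window of _size units
+    // runs across neighboring feature maps (channels) rather than over a
+    // spatial neighborhood.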
+protected: + bool _blocked; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class ContrastNormLayer : public ResponseNormLayer { +protected: + NVMatrix _meanDiffs; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); +public: + ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CostLayer : public Layer { +protected: + float _coeff; + doublev _costv; + NVMatrix _tmpbuf; // For error accumulation + int _numCases; // number of cases that the values in _costv were computed on + bool _aggregated; + void fpropCommon(PASS_TYPE passType); +public: + CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); + bool fprop(PASS_TYPE passType, int passIdx); + + int getNumCases(); + virtual doublev& getCost(); + float getCoeff(); + bool isGradProducer(); + void setSendTerminalMessages(bool send); + void resetPassIdx(); + + static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID); +}; + +/* + * Input 0: labels + * Input 1: softmax outputs + */ +class CrossEntCostLayer : public CostLayer { +protected: + NVMatrix _trueLabelLogProbs, _correctProbs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +/* + * Input 0: labels + * Input 1: softmax outputs + */ +class LogregCostLayer : public CostLayer { +protected: + NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs; + std::map _probsAccum; // input replica idx -> nvmatrix + NVMatrix _maxProbs; + std::map _numAccumed; // input replica idx -> int + int _topk; + bool _doCompute; + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + NVMatrix& getProbsAccum(int replicaIdx); +}; + +/* + * Input 0: labels + * Input 1: logistic outputs + */ +class BinomialCrossEntropyCostLayer : public CostLayer { +protected: + bool _computeSoftmaxErrorRate; + NVMatrix _tmpProbs, _tmpVec, _correctProbs; + float _posWeight; + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + float getPosWeight(); + + // Only for use with non-logistic units + class BinomialCrossEntGradientOperator { + private: + float _coeff, _posWeight; + public: + BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y)); + } 
+ }; +}; + +/* + * Input 0: labels + * Input 1: logistic outputs + */ +class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer { +protected: + Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive; + NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); +public: + DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class SumOfSquaresCostLayer : public CostLayer { +protected: + NVMatrix _tmp; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +#endif /* LAYER_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh new file mode 100644 index 0000000..ec61266 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LAYER_KERNELS_CUH +#define LAYER_KERNELS_CUH + +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" + +#define LOGREG_GRAD_THREADS_X 32 +#define LOGREG_GRAD_THREADS_Y 4 + +#define LOGREG_ERR_THREADS_X 128 +#define LOGREG_ERR_THREADS_Y 1 + +__device__ inline float safelog(const float x) { + return x > 0.0f ? __logf(x) : -50.0f; +} + +// The input matrix here is the squared norm. +// This replaces the squared norm with: +// 1 if it is below the threshold given by norm2 +// norm/sqrt(a) otherwise -- i.e. the desired norm (not squared) +class MaxWeightConstraintOperator { +private: + float _norm, _norm2; +public: + MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { + } + __device__ inline float operator()(const float a) const { + return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f; + } +}; + +class HardWeightConstraintOperator { +private: + float _norm, _norm2; +public: + HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { + } + __device__ inline float operator()(const float a) const { + return __fdividef(_norm, sqrtf(a)); + } +}; + +class WeightContrastNormOperator { +private: + float _min, _max, _scale; +public: + WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) { + } + __device__ inline float operator()(float a) const { + a = sqrtf(a) * _scale; + return a < _min ? __fdividef(_min, a) : a > _max ? 
__fdividef(_max, a) : 1.0f;
+    }
+};
+
+void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
+void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
+
+void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
+void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+
+
+// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
+// to avoid dividing and then multiplying by quantities that may be near zero.
+void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
+void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
+                            NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
+#endif /* LAYER_KERNELS_CUH */
+
diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
new file mode 100644
index 0000000..10a409a
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LR_CUH
+#define LR_CUH
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "util.cuh"
+#include "../../nvmatrix/include/nvmatrix.cuh"
+#include "../../util/include/matrix.h"
+
+/*
+ * The maximum learning rate is _baseRate.
+ * The minimum learning rate is _baseRate / _tgtFactor.
+ *
+ * These classes define annealing schedules that interpolate between these
+ * two extrema.
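+ *
+ * getValue(progress) maps the fraction of training completed (nominally in
+ * [0, 1]) to the scheduled value; the subclasses below interpolate between
+ * the two extrema linearly, exponentially, or in discrete exponential steps.
+ *
+ * A minimal usage sketch (schedDict stands in for the Python-side schedule
+ * definition passed down from the layer config):
+ *
+ *   ParameterSchedule& lrs = ParameterSchedule::make(schedDict);
+ *   double rate = lrs.getValue(0.5);  // scheduled value halfway through training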
+ */ +class ParameterSchedule { +protected: + double _baseRate; +public: + ParameterSchedule(double base); + virtual double getValue(double progress); + double getBaseValue() const; + virtual ~ParameterSchedule(); + + static ParameterSchedule& make(PyObject* schedDict); +}; + +class LinearParameterSchedule : public ParameterSchedule { +protected: + double _finalRate; +public: + LinearParameterSchedule(double base, double tgtFactor); + virtual double getValue(double progress); +}; + +class ExpParameterSchedule : public ParameterSchedule { +protected: + double _powBase; +public: + ExpParameterSchedule(double baseRate, double tgtFactor); + virtual double getValue(double progress); +}; + +class DiscreteExpParameterSchedule : public ParameterSchedule { +protected: + std::vector _rates; +public: + DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps); + virtual double getValue(double progress); +}; + + +#endif /* LR_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh new file mode 100644 index 0000000..9ea3f69 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh @@ -0,0 +1,61 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" + +class MemorySource; + +class MemoryView { +protected: + MemorySource* _src; + std::string _name; +public: + MemoryView(MemorySource& src, std::string& name); + ~MemoryView(); + NVMatrix& getMemory(int numCases); + NVMatrix& getMemory(); + MemorySource& getMemorySource(); + bool isParent(); + std::string& getName(); + MemoryView& clone(std::string& name); +}; + +// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU. +class MemorySource { +protected: +// int _inputIdx; + NVMatrix _memory; + int _deviceID; + int _size; + std::map > _viewRanges; + std::map _memoryViews; // input idx --> slice of _memory + std::set _truncateRequests; + Lock _lock; +public: + MemorySource(int size, int deviceID); + ~MemorySource(); + NVMatrix& getMemory(std::string& name, int numCases); + NVMatrix& getMemory(std::string& name); + MemoryView& addUser(std::string& name, std::pair range); + MemoryView& addUser(std::string& name); + std::pair getRange(std::string& name); + int getSize(); + bool truncate(std::string& name); + static MemoryView& make(int size, int deviceID, std::string& parentUser); +}; + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh new file mode 100644 index 0000000..25dd2f4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh @@ -0,0 +1,128 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MESSAGES_CUH_ +#define MESSAGES_CUH_ + +#include +#include "layer.cuh" + +class Layer; + +enum MESSAGES { FPROP_TERMINAL, + BPROP_TERMINAL, + BPROP_READY, + FPROP_READY, + SYNC, + COPY_TO_CPU, + COPY_TO_GPU, + UPDATE_WEIGHTS, + CONSTRAIN_WEIGHTS, + RESET, + RESET_PASS_IDX, + COST_COMPUTED, + BPROP_START, + EXIT_CONVNET}; + +class Message { +protected: + MESSAGES _messageType; +public: + MESSAGES getType() { + return _messageType; + } + virtual Message* clone() { + return new Message(_messageType); + } + Message(MESSAGES messageType) : _messageType(messageType) { + } + virtual ~Message() { + } +}; + +class PropMessage : public Message { +protected: + Layer *_toLayer; + PASS_TYPE _passType; + int _passIdx; +public: + + Layer& getToLayer() { + return *_toLayer; + } + + PASS_TYPE getPassType() { + return _passType; + } + + int getPassIdx() { + return _passIdx; + } + + virtual PropMessage* clone() { + return new PropMessage(*_toLayer, _passType, _passIdx, _messageType); + } + + PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType) + : _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) { + } +}; + +class FpropMessage : public PropMessage { +public: + FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) + : PropMessage(toLayer, passType, passIdx, FPROP_READY) { + } + virtual FpropMessage* clone() { + return new FpropMessage(*_toLayer, _passType, _passIdx); + } +}; + +class BpropMessage : public PropMessage { +public: + BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) + : PropMessage(toLayer, passType, passIdx, BPROP_READY) { + } + virtual BpropMessage* clone() { + return new BpropMessage(*_toLayer, _passType, _passIdx); + } +}; + +class BpropStartMessage : public Message { +protected: + PASS_TYPE _passType; + int _passIdx; +public: + PASS_TYPE getPassType() { + return _passType; + } + + int getPassIdx() { + return _passIdx; + } + + virtual BpropStartMessage* clone() { + return new BpropStartMessage(_passType, _passIdx); + } + + BpropStartMessage(PASS_TYPE passType, int passIdx) + : _passType(passType), Message(BPROP_START), _passIdx(passIdx) { + } +}; + + + +#endif /* MESSAGES_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh new file mode 100644 index 0000000..d573901 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh @@ -0,0 +1,541 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NEURONS_CUH +#define NEURONS_CUH + +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include + +template +class AddGradientBinaryOperator { + GradientOp _op; +public: + AddGradientBinaryOperator(GradientOp op) : _op(op) { + } + __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const { + return _op(unitActGrad, unitAct) + target; + } +}; + +template +class AddGradientOperator { + GradientOp _op; +public: + AddGradientOperator(GradientOp op) : _op(op) { + } + __device__ inline float operator()(const float unitActGrad, const float target) const { + return target + _op(unitActGrad); + } +}; + +/* ======================= + * Neuron + * ----------------------- + * + * f(x) = x + * ======================= + */ +class Neuron { +protected: + bool _activated; + // Inputs and outputs potentially point to the same matrix, depending on the neuron + NVMatrix* _inputs, *_outputs; + virtual void _activate() { + if (_inputs != _outputs) { + _inputs->copy(*_outputs); + } + } + virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + if (&target != &actsGrad) { + actsGrad.copy(target); + } + } + virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + if (&target != &actsGrad) { + target.add(actsGrad); + } + } +public: + Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) { + } + virtual void activate(NVMatrix& inputs, NVMatrix& outputs) { + _activated = true; + _inputs = &inputs; + _outputs = &outputs; + _activate(); + } + + virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) { + assert(_activated); + if (!add) { + target.resize(actsGrad); + _computeInputGrad(actsGrad, target); + } else { + _addInputGrad(actsGrad, target); + } + } + + static Neuron& makeNeuron(PyObject* neuronDict); +}; + +/* ======================= + * LogisticNeuron + * ----------------------- + * + * f(x) = 1 / (1 + e^-x) + * ======================= + */ +class LogisticNeuron : public Neuron { +protected: + void _activate() { + _inputs->apply(NVMatrixOps::Logistic(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(LogisticGradientOperator()), *_outputs, target, target); + } +public: + class LogisticGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * unitAct * (1.0f - unitAct); + } + }; + + LogisticNeuron() : Neuron() { + } +}; + +/* ======================= + * LogNeuron + * ----------------------- + * + * f(x) = log(eps + x) + * ======================= + */ +class LogNeuron : public Neuron { +protected: + float _eps; + void _activate() { + _inputs->apply(LogOperator(_eps), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(LogGradientOperator(_eps)), *_inputs, target, target); + } +public: + class LogGradientOperator { + protected: + float _eps; + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return __fdividef(unitActGrad, _eps + unitInput); + } + LogGradientOperator(float eps) : _eps(eps) { + + } + }; + + class LogOperator { + 
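+        // Applied in the forward pass: computes f(x) = __logf(_eps + x).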
protected: + float _eps; + public: + __device__ inline float operator()(float x) const { + return __logf(_eps + x); + } + LogOperator(float eps) : _eps(eps) { + + } + }; + + LogNeuron(float eps) : _eps(eps), Neuron() { + } +}; + +/* ======================= + * ReluNeuron + * ----------------------- + * + * f(x) = max(0, x) + * ======================= + */ +class ReluNeuron : public Neuron { +protected: + virtual void _activate() { + _inputs->apply(ReluOperator(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(ReluGradientOperator()), *_outputs, target, target); + } +public: + class ReluOperator { + public: + __device__ inline float operator()(float x) const { + return x < 0.0f ? 0.0f : x; + } + }; + + class ReluGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * (unitAct > 0.0f); + } + }; + + ReluNeuron() : Neuron() { + } +}; + + +/* ======================= + * BoundedReluNeuron + * ----------------------- + * + * f(x) = min(a, max(0, x)) + * ======================= + */ +class BoundedReluNeuron : public Neuron { +protected: + float _a; + + void _activate() { + _inputs->apply(BoundedReluOperator(_a), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(BoundedReluGradientOperator(_a)), *_outputs, target, target); + } +public: + class BoundedReluOperator { + private: + float _a; + public: + BoundedReluOperator(float a) : _a(a) { + } + __device__ inline float operator()(float x) const { + return x < 0.0f ? 0.0f : x > _a ? _a : x; + } + }; + + class BoundedReluGradientOperator { + private: + float _a; + public: + BoundedReluGradientOperator(float a) : _a(a) { + } + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * (unitAct > 0.0f) * (unitAct < _a); + } + }; + + BoundedReluNeuron(float a) : Neuron(), _a(a) { + } +}; + +/* ======================= + * AbsNeuron + * ----------------------- + * + * f(x) = abs(x) + * ======================= + */ +class AbsNeuron : public Neuron { +protected: + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(NVMatrixOps::Abs(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(AbsGradientOperator()), *_inputs, target, target); + } +public: + class AbsGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return unitActGrad * (unitInput > 0.0f ? 
1.0f : -1.0f); + } + }; + + AbsNeuron() : Neuron() { + } +}; + +/* ======================= + * TanhNeuron + * ----------------------- + * + * f(x) = a*tanh(b*x) + * ======================= + */ +class TanhNeuron : public Neuron { +protected: + float _a, _b; + + void _activate() { + _inputs->apply(TanhOperator(_a, _b), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(TanhGradientOperator(_a, _b)), *_outputs, target, target); + } +public: + class TanhOperator { + private: + float _a, _n2b; + public: + TanhOperator(float a, float b) : _a(a), _n2b(-2*b) { + } + virtual __device__ inline float operator()(float x) const { + return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f); + } + }; + + class TanhGradientOperator { + private: + float _b, _a; + public: + TanhGradientOperator(float a, float b) : _b(b), _a(a) { + } + __device__ inline float operator()(float unitActGrad, float unitAct) const { +// const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f; +// return unitActGrad * _n4ab * (t * (t - 1.0f)); + return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a)); + } + }; + + TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) { + } +}; + +/* ======================= + * DoubleReluNeuron + * ----------------------- + * + * f(x) = x - a*tanh(x/a) + * ======================= + */ +class DoubleReluNeuron : public Neuron { +protected: + float _a; + + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(DoubleReluOperator(_a), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(DoubleReluGradientOperator(_a)), *_inputs, target, target); + } +public: + class DoubleReluOperator { + private: + float _a, _n2a; + public: + DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) { + } + virtual __device__ inline float operator()(float x) const { + return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f); + } + }; + + class DoubleReluGradientOperator { + private: + float _n2a; + public: + DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) { + } + __device__ inline float operator()(float unitActGrad, float unitInput) const { + const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f; + return unitActGrad * (tanh*tanh); + } + }; + + DoubleReluNeuron(float a) : Neuron(), _a(a) { + } +}; + +/* ======================= + * SoftReluNeuron + * ----------------------- + * + * f(x) = log(1 + e^x) + * ======================= + */ +class SoftReluNeuron : public Neuron { +protected: + void _activate() { +// assert(_inputs != _outputs); + _inputs->apply(SoftReluOperator(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SoftReluGradientOperator()), *_outputs, target, target); + } +public: + class SoftReluOperator { + public: + __device__ inline float operator()(float x) const { + // This piece-wise implementation has better numerical stability than + // simply computing log(1 + e^x). + return x > 4.0f ? 
x : __logf(1.0f + __expf(x)); + } + }; + + class SoftReluGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitOutput) const { + if (unitOutput > 4.0f) { + return unitActGrad; + } + const float f = __expf(-unitOutput); + return unitActGrad * (1.0f - f); + } + }; + + SoftReluNeuron() : Neuron() { + } +}; + +/* ======================= + * SquareNeuron + * ----------------------- + * + * f(x) = x^2 + * ======================= + */ +class SquareNeuron : public Neuron { +protected: + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(NVMatrixOps::Square(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SquareGradientOperator()), *_inputs, target, target); + } +public: + class SquareGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return unitActGrad * 2.0f * unitInput; + } + }; + + SquareNeuron() : Neuron() { + } +}; + +/* ======================= + * SqrtNeuron + * ----------------------- + * + * f(x) = sqrt(x) + * ======================= + */ +class SqrtNeuron : public Neuron { +protected: + void _activate() { + _inputs->apply(NVMatrixOps::Sqrt(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SqrtGradientOperator()), *_outputs, target, target); + } +public: + class SqrtGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return __fdividef(unitActGrad, 2.0f * unitAct); + } + }; + + SqrtNeuron() : Neuron() { + } +}; + +/* ======================= + * LinearNeuron + * ----------------------- + * + * f(x) = a*x + b + * ======================= + */ +class LinearNeuron : public Neuron { +protected: + float _a, _b; + void _activate() { + _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.scale(_a, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(AddGradientOperator(NVMatrixOps::MultByScalar(_a)), target, target); + } +public: + LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) { + } +}; +#endif /* NEURONS_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh new file mode 100644 index 0000000..9c43c9d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh @@ -0,0 +1,175 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PIPEDISPENSER_CUH_ +#define PIPEDISPENSER_CUH_ + +#include +#include +#include +#include +#include "../../util/include/thread.h" +#include "util.cuh" + +/* + * PipeDispenser interface + */ +class PipeDispenser { +protected: + int _numPipes; + seti _pipes; + pthread_mutex_t *_mutex; + + void lock() { + pthread_mutex_lock(_mutex); + } + + void unlock() { + pthread_mutex_unlock(_mutex); + } + + virtual void init() { + _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + pthread_mutex_init(_mutex, NULL); + } +public: + PipeDispenser(const seti& pipes) { + _pipes.insert(pipes.begin(), pipes.end()); + init(); + } + + PipeDispenser(int numPipes) { + for (int i = 0; i < numPipes; ++i) { + _pipes.insert(i); + } + init(); + } + + virtual ~PipeDispenser() { + pthread_mutex_destroy(_mutex); + free(_mutex); + } + + virtual int getPipe(const seti& interested) = 0; + + int getPipe(int interested) { + seti tmp; + tmp.insert(interested); + return getPipe(tmp); + } + + virtual void freePipe(int pipe) = 0; +}; + +/* + * This one blocks until there is a free pipe to return. + */ +class PipeDispenserBlocking : public PipeDispenser { +protected: + pthread_cond_t *_cv; + + void wait() { + pthread_cond_wait(_cv, _mutex); + } + + void broadcast() { + pthread_cond_broadcast(_cv); + } + + int getAvailablePipes(const seti& interested, intv& available) { + available.clear(); + std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available)); + return available.size(); + } + + virtual void init() { + PipeDispenser::init(); + _cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t))); + pthread_cond_init(_cv, NULL); + } +public: + PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) { + init(); + } + + PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) { + init(); + } + + ~PipeDispenserBlocking() { + pthread_cond_destroy(_cv); + free(_cv); + } + + int getPipe(const seti& interested) { + lock(); + intv avail; + while (getAvailablePipes(interested, avail) == 0) { + wait(); + } + int pipe = avail[0]; + _pipes.erase(pipe); + unlock(); + return pipe; + } + + void freePipe(int pipe) { + lock(); + _pipes.insert(pipe); + broadcast(); + unlock(); + } +}; + +/* + * This one returns the least-occupied pipe. + */ +class PipeDispenserNonBlocking : public PipeDispenser { +protected: + std::map _pipeUsers; + +public: + PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) { + for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) { + _pipeUsers[*it] = 0; + } + } + + int getPipe(const seti& interested) { + lock(); + int pipe = -1, users = 1 << 30; + for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) { + if (interested.count(*it) > 0 && _pipeUsers[*it] < users) { + pipe = *it; + users = _pipeUsers[*it]; + } + } + if (pipe >= 0) { + _pipeUsers[pipe]++; + } + unlock(); + return pipe; + } + + void freePipe(int pipe) { + lock(); + _pipeUsers[pipe]--; + unlock(); + } +}; + + +#endif /* PIPEDISPENSER_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh new file mode 100644 index 0000000..911c4cd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh @@ -0,0 +1,35 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PYCONVNET3_CUH +#define PYCONVNET3_CUH + +#define _QUOTEME(x) #x +#define QUOTEME(x) _QUOTEME(x) + +extern "C" void init_ConvNet(); + +PyObject* initModel(PyObject *self, PyObject *args); +PyObject* startBatch(PyObject *self, PyObject *args); +PyObject* finishBatch(PyObject *self, PyObject *args); +PyObject* checkGradients(PyObject *self, PyObject *args); +PyObject* syncWithHost(PyObject *self, PyObject *args); +PyObject* startMultiviewTest(PyObject *self, PyObject *args); +PyObject* startFeatureWriter(PyObject *self, PyObject *args); +PyObject* startDataGrad(PyObject *self, PyObject *args); +PyObject* decodeJpeg(PyObject *self, PyObject *args); + +#endif diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh new file mode 100644 index 0000000..8bafce5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh @@ -0,0 +1,185 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef REDUCEPIPELINE_CUH_H_ +#define REDUCEPIPELINE_CUH_H_ + +#include "../../util/include/thread.h" +#include "../../util/include/queue.h" +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k +#define REDUCE_MAX_CHUNKS 16 +#define REDUCE_MIN_CHUNKS 2 + +enum REDUCE_MESSAGE_TYPE { + REDUCE_CHUNK, + REDUCE_START, + EXIT +}; + +class ReducePeer; +class ReducerSource; +class IReduceSegment; +class IEightGPUReducer; + +class ReduceMessage { +protected: + REDUCE_MESSAGE_TYPE _msgType; + float _scaleIntermediates, _scaleTarget; + std::map* _mats; +public: + ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map& mats) + : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) { + } + ReduceMessage(REDUCE_MESSAGE_TYPE msgType) + : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) { + } + inline REDUCE_MESSAGE_TYPE getType() const { + return _msgType; + } + inline float getScaleIntermediates() const { + return _scaleIntermediates; + } + inline float getScaleTarget() const { + return _scaleTarget; + } + inline NVMatrix& getMatrix(int deviceID) const { + return *_mats->at(deviceID); + } + inline std::map& getMatrices() const { + return *_mats; + } +}; + +class ReduceChunkMessage : public ReduceMessage { +protected: + int _chunkIdx; + int _chunkSize; + int _numChunks; + + IReduceSegment* _src; +public: + ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map& mats) + : _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), + ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) { + } + + inline int getChunkIdx() const { + return _chunkIdx; + } + + inline int getChunkSize() const { + return _chunkSize; + } + + inline int getNumChunks() const { + return _numChunks; + } + + inline IReduceSegment& getSource() const { + return *_src; + } +}; + +class ReduceStartMessage : public ReduceMessage { +public: + ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map& mats) + : ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) { + } +}; + +class IReduceSegment : public Thread { +protected: + int _deviceID; + std::vector _prev; + ReducePeer* _next; + Queue _queue; + Queue* _finishQueue; + + NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx); + void* run(); + virtual bool processMessage(ReduceMessage& msg) = 0; + +public: + IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue* finishQueue); + virtual ~IReduceSegment(); + inline virtual NVMatrix& getMatrix(ReduceMessage& msg); + Queue& getQueue(); + int getDeviceID() const; + void addPrev(IReduceSegment& c); + void addNext(ReducePeer& c); + bool isTerminal() const; +}; + +class ReducerSource : public IReduceSegment { +protected: + bool processMessage(ReduceMessage& msg); +public: + ReducerSource(IEightGPUReducer& parent, int deviceID); +}; + +class ReducePeer : public IReduceSegment { +protected: + std::map _streams; // device id -> stream + std::map _numInputsReceived; // chunk idx -> num inputs + int _numInputsFinished; + HostNVMatrix _mat; + bool _add; + bool processMessage(ReduceMessage& msg); + inline cudaStream_t getStream(int deviceID); + inline NVMatrix& getMatrix(ReduceMessage& msg); + void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt); +public: + 
ReducePeer(IEightGPUReducer& parent, int deviceID, Queue* finishQueue); + ReducePeer(IEightGPUReducer& parent); + ~ReducePeer(); +}; + +class IEightGPUReducer { +protected: + std::vector _sources; + std::vector _peers; + Queue _finishQueue; + int _tgtDeviceID; + virtual void makeConnections(std::vector& same, std::vector&other) = 0; +public: + IEightGPUReducer(int tgtDeviceID); + virtual ~IEightGPUReducer(); + IEightGPUReducer& construct(); + void reduce(std::map& mats, float scaleIntermediates, float scaleTarget); + void reduce(std::map& mats, float scaleIntermediates); + void reduce(std::map& mats); + int getTgtDeviceID() const; +}; + +class EightGPUReducer1 : public IEightGPUReducer { +protected: + void makeConnections(std::vector& same, std::vector&other); +public: + EightGPUReducer1(int tgtDeviceID); +}; + +class EightGPUReducer2 : public IEightGPUReducer { +protected: + void makeConnections(std::vector& same, std::vector&other); +public: + EightGPUReducer2(int tgtDeviceID); +}; + +#endif /* REDUCEPIPELINE_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh new file mode 100644 index 0000000..7aa27f9 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh @@ -0,0 +1,53 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAMBROADCAST_CUH_ +#define STREAMBROADCAST_CUH_ + +#include +#include "../../util/include/queue.h" +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +class Layer; + +//#define NUM_STREAM_COPY_PARTS 4 +// This is in 4-byte words, not bytes +#define SB_MIN_CHUNK_SIZE (1<<17) +#define SB_MAX_CHUNKS 16 + +class StreamBroadcast { +protected: + std::map _streams; + std::set _ownedStreams; + HostNVMatrix _hostMem; + void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice); + void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput); + void init(std::map& streams); + void init(std::map& mats); +public: + StreamBroadcast(std::map& streams); + StreamBroadcast(); + virtual ~StreamBroadcast(); + + void transfer(std::map& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput); + void transfer(std::map& mats, int srcDevice, float scaleTarget, float scaleOutput); + void transfer(std::map& mats, int srcDevice); + void sync(int deviceID); + cudaStream_t getStream(int deviceID); +}; + +#endif /* STREAMBROADCAST_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh new file mode 100644 index 0000000..3f479f2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh @@ -0,0 +1,52 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TIMER_CC_H_ +#define TIMER_CC_H_ + +#include + +class Timer { +protected: + StopWatchInterface* _timer; + bool _started; + +public: + Timer() : _started(false) { + sdkCreateTimer(&_timer); + } + + ~Timer() { + sdkDeleteTimer(&_timer); + } + inline void start () { + _started = true; + sdkResetTimer(&_timer); + sdkStartTimer(&_timer); + } + + inline double stop() { + sdkStopTimer(&_timer); + _started = false; + return sdkGetTimerValue(&_timer); + } + + inline bool isStarted() const { + return _started; + } +}; + +#endif /* TIMER_CC_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh new file mode 100644 index 0000000..ef31e44 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh @@ -0,0 +1,130 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_H +#define UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../util/include/matrix.h" + + +#define PASS_TYPE uint +#define PASS_TRAIN 0x1 +#define PASS_TEST 0x2 +#define PASS_GC 0x4 +#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8) +#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10) +#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20) +#define PASS_FEATURE_GEN 0x40 + +#define HAS_FLAG(f, x) (((x) & (f)) == (f)) +#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x) +#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x) +#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x) +#define IS_TEST(x) HAS_FLAG(PASS_TEST, x) +#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x) + +// For gradient checking +#define GC_SUPPRESS_PASSES false +#define GC_REL_ERR_THRESH 0.02 + +#ifdef DO_PRINT +#define PRINT(x, args...) printf(x, ## args); +#else +#define PRINT(x, args...) ; +#endif + +/* + * Generates a random floating point number in the range 0-1. 
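+ * Note: the macro below uses rand(), so results are only reproducible if the
+ * caller seeds the generator (e.g. with srand()), and it is not thread-safe.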
+ */ +#define randf ((float)rand() / RAND_MAX) + +//typedef std::vector MatrixV; +//typedef std::vector NVMatrixV; +typedef std::map*> CostMap; +typedef std::map CostCoeffMap; +typedef std::vector doublev; +typedef std::vector floatv; +typedef std::vector intv; +typedef std::vector stringv; +typedef std::set seti; +typedef std::vector PyObjectV; + +stringv* getStringV(PyObject* pyList); +floatv* getFloatV(PyObject* pyList); +intv* getIntV(PyObject* pyList); +MatrixV* getMatrixV(PyObject* pyList); +MatrixV* getMatrixV(PyObject* pyList, int len); +int* getIntA(PyObject* pyList); + +int pyDictGetInt(PyObject* dict, const char* key); +intv* pyDictGetIntV(PyObject* dict, const char* key); +std::string pyDictGetString(PyObject* dict, const char* key); +float pyDictGetFloat(PyObject* dict, const char* key); +floatv* pyDictGetFloatV(PyObject* dict, const char* key); +Matrix* pyDictGetMatrix(PyObject* dict, const char* key); +MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key); +int* pyDictGetIntA(PyObject* dict, const char* key); +stringv* pyDictGetStringV(PyObject* dict, const char* key); +bool pyDictHasKey(PyObject* dict, const char* key); +PyObjectV* pyDictGetValues(PyObject* dict); + +template std::string tostr(T n); +template void shuffleVector(std::vector& v, int start, int end); +template void deleteElements(std::vector& v); +template void deleteElements(std::vector& v, bool deleteContainer); + +template +int indexOf(std::vector& v, T e) { + int i = 0; +// typename vector::iterator it2 = v.begin(); + for (typename std::vector::const_iterator it = v.begin(); it != v.end(); ++it) { + if (*it == e) { + return i; + } + ++i; + } + return -1; +} + +std::vector& getDeviceCPUs(int deviceID); + +template std::set getKeys(std::map& m) { + std::set s; + for (typename std::map::const_iterator it = m.begin(); it != m.end(); ++it) { + s.insert(it->first); + } + return s; +} + +struct LayerIDComparator { + bool operator()(PyObject* i, PyObject* j) { + return pyDictGetInt(i, "id") < pyDictGetInt(j, "id"); + } +}; + +#endif /* UTIL_H */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh new file mode 100644 index 0000000..dd1e522 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh @@ -0,0 +1,159 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef WEIGHTS_CUH +#define WEIGHTS_CUH + +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../util/include/matrix.h" +#include "util.cuh" +#include "lr.cuh" +#include "layer.cuh" +#include "copypipeline.cuh" +#include "reducepipeline.cuh" +#include "streambroadcast.cuh" + +class Layer; +class Weights; +class StreamBroadcast; + +class IWeightReducer { +protected: + int _tgtReplicaID; + std::map _replicas; + + int getDeviceID(); +public: + IWeightReducer(std::map& replicas, int srcReplicaID); + virtual ~IWeightReducer(); + static IWeightReducer& make(std::map& replicas, int srcReplicaID); + virtual void reduce(std::map gradShards, float gradScale, bool toInc) = 0; +}; + +class SequentialWeightReducer : public IWeightReducer { +protected: + StreamBroadcast* _sb; +public: + SequentialWeightReducer(std::map& replicas, int srcReplicaID); + ~SequentialWeightReducer(); + void reduce(std::map gradShards, float gradScale, bool toInc); +}; + +class ParallelWeightReducer : public IWeightReducer { +protected: + IEightGPUReducer* _reducer; +public: + ParallelWeightReducer(std::map& replicas, int srcReplicaID); + ~ParallelWeightReducer(); + void reduce(std::map gradShards, float gradScale, bool toInc); +}; + +class Weights { +protected: + Matrix* _hWeights, *_hWeightsInc; + NVMatrix* _weights, *_weightsInc, *_weightsGrad; + + ParameterSchedule* _lrs; + + float _wc, _mom, _wball; + bool _onGPU, _useGrad, _cleanup; + int _numUpdates; + + // Note: every layer is its own sibling too + std::map _replicas; + + // Non-NULL if these weights are really shared from some other layer + Weights* _srcWeights; + Layer* _parent; + int _shardSize; + IWeightReducer* _reducer; + ISafeBroadcastNetwork* _broadcaster; + + void aggregateReplicaGradients(float progress); + + // TODO: assert that these retrun contiguous views + template T& getShard(T& mat, int replicaID); + template T& getShard(T& mat); + void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup); + +public: + NVMatrix& operator*() const; + + Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent); + Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, + float wc, float wball, float mom, bool useGrad); + + virtual ~Weights(); + + virtual NVMatrix& getW() const; + virtual NVMatrix& getInc() const; + virtual NVMatrix& getGrad() const; + virtual Matrix& getCPUW() const; + virtual Matrix& getCPUWInc() const; + virtual ParameterSchedule& getLearningRateSchedule() const; + virtual int getNumRows() const; + virtual int getNumCols() const; + virtual void copyToCPU(); + + // This function is assumed to be called in the order in which the layers + // were defined + virtual void copyToGPU(); + + virtual void update(float progress); + virtual void addReplica(Weights& sibling); + int incNumUpdates(); + + // Returns the number of times a gradient has been computed for this + // weight matrix during the current pass (interval between two calls of update()) + // through the net. This number will only be greater than 1 if this weight matrix + // is *shared* by multiple layers in the net. 
+ int getNumUpdates() const; + float getEps(float progress) const; + float getMom() const; + float getWC() const; + float getWBall() const; + bool isUseGrad() const; + bool isOwner() const; + int getReplicaID(); + int getDeviceID(); + Layer& getParent(); + std::map& getReplicas(); + ISafeBroadcastNetwork& getBroadcaster(); + IWeightReducer& getReducer(); +}; + +class WeightList { +private: + std::vector _weightList; +public: + Weights& operator[](const int idx) const; + ~WeightList(); + WeightList(); + Weights& at(const int i) const; + void addWeights(Weights& w); + void addReplica(WeightList& sibling); + void update(float progress); + void copyToCPU(); + void copyToGPU(); + int getSize() const; +}; + +#endif /* WEIGHTS_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh new file mode 100644 index 0000000..233e383 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh @@ -0,0 +1,123 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef WORKER_CUH +#define WORKER_CUH + +#include "convnet.cuh" +#include "cost.cuh" +#include "data.cuh" + +class ConvNet; +class Cost; + +class WorkResult { +public: + enum RESULTS {BATCH_DONE, SYNC_DONE}; +protected: + WorkResult::RESULTS _resultType; + Cost* _results; +public: + WorkResult(WorkResult::RESULTS resultType, Cost& results); + WorkResult(WorkResult::RESULTS resultType); + virtual ~WorkResult(); + Cost& getResults() const; + WorkResult::RESULTS getResultType() const; +}; + +class Worker { +protected: + ConvNet* _convNet; +public: + Worker(ConvNet& convNet); + virtual ~Worker(); + virtual bool run() = 0; +}; + +class DataWorker : public Worker { +protected: + CPUData* _data; + DataProvider* _dp; +public: + DataWorker(ConvNet& convNet, CPUData& data); + virtual ~DataWorker(); + bool run(); + virtual void _run() = 0; +}; + +class TrainingWorker : public DataWorker { +protected: + bool _test; + double _progress; +public: + TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test); + void _run(); +}; + +class SyncWorker : public Worker { +public: + SyncWorker(ConvNet& convNet); + bool run(); +}; + +class ExitWorker : public Worker { +public: + ExitWorker(ConvNet& convNet); + bool run(); +}; + +class GradCheckWorker : public DataWorker { +public: + GradCheckWorker(ConvNet& convNet, CPUData& data); + void _run(); +}; + +class MultiviewTestWorker : public DataWorker { +protected: + int _numViews; + Matrix* _cpuProbs; + std::string _logregName; + CPUData& getMinibatch(int v, int i); +public: + MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName); + MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews); + ~MultiviewTestWorker(); + void _run(); +}; + +class FeatureWorker : public DataWorker { +protected: + MatrixV *_ftrs; + stringv *_layerNames; + bool _deleteFeatures; +public: + 
FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true); + ~FeatureWorker(); + void _run(); +}; + +class DataGradWorker : public DataWorker { +protected: + Matrix* _dataGrads; + int _dataLayerIdx, _softmaxLayerIdx; +public: + DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx); + ~DataGradWorker(); + void _run(); +}; + +#endif/* WORKER_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu new file mode 100644 index 0000000..0493d40 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../include/actbroadcaster.cuh" + +using namespace std; + +/* + * ===================== + * BroadcastMessage + * ===================== + */ +BroadcastMessage::BroadcastMessage(map mats, int srcDevice, int userIdx, Queue& finishQueue) + : _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) { +} + +BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type) + : _type(type), _finishQueue(NULL) { +} + +int BroadcastMessage::getSrcDevice() { + return _srcDevice; +} + +map& BroadcastMessage::getMatrices() { + return _mats; +} + +int BroadcastMessage::getUserIdx() { + return _userIdx; +} + +Queue& BroadcastMessage::getFinishQueue() { + return *_finishQueue; +} + +BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() { + return _type; +} + +/* + * ===================== + * ExitBroadcastMessage + * ===================== + */ +ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) { +} + +/* + * ===================== + * ActBroadcaster + * ===================== + */ +ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) { +} + +ActBroadcaster::~ActBroadcaster() { + for (map::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) { + delete it->second; + } +} + +Queue& ActBroadcaster::getMessageQueue() { + return _messageQueue; +} + +void* ActBroadcaster::run() { + int nextUserIdx = 0; + bool exit = false; + while (!exit) { + BroadcastMessage& msg = *_messageQueue.dequeue(); + if (msg.getMessageType() == BroadcastMessage::EXIT) { + exit = true; + delete &msg; + } else { + if (msg.getUserIdx() == nextUserIdx) { + if (_broadcasters.count(msg.getSrcDevice()) == 0) { + _broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice()); + } + _broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices()); + msg.getFinishQueue().enqueue(0); + delete &msg; + nextUserIdx = (nextUserIdx + 1) % _numUsers; + } else { + _messageQueue.enqueue(&msg); + } + } + } + return NULL; +} + +void ActBroadcaster::stop() { + getMessageQueue().enqueue(new ExitBroadcastMessage()); + 
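+    // Block until the broadcaster thread has dequeued the exit message and terminated.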
join(); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu new file mode 100644 index 0000000..bb4c70c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu @@ -0,0 +1,782 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../nvmatrix/include/nvmatrix_operators.cuh" +#include "../../util/include/matrix.h" +#include "../include/convnet.cuh" +#include "../include/util.cuh" + +using namespace std; + +/* + * ======================= + * ConvNet + * ======================= + */ +ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, + int minibatchSize, bool conserveMem) : Thread(true) { + _deviceIDs = deviceIDs; + _data = NULL; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; + _trainingProgress = 0; + _totalPassesDone = 0; + _conserveMem = conserveMem; + _sync = new ThreadSynchronizer(deviceIDs.size() + 1); + PyObjectV* layerList = pyDictGetValues(layerParams); + std::sort(layerList->begin(), layerList->end(), LayerIDComparator()); + + + _dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now + + initDataLayers(layerList); + initGPUThreads(layerList); + connectReplicas(); // Connect replicas to one another + connectChildren(layerParams); // Connect forward/backward links in graph + _numFwdTerminal = 0; + // Execute post-initialization stuff + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + for (int r = 0; r < it->second.size(); r++) { + _numFwdTerminal += it->second[r]->getNext().size() == 0; + if (it->second[r]->getNext().size() == 0) { + printf("Fwd terminal: %s\n", it->second[r]->getName().c_str()); + } + it->second[r]->postInit(); + } + } + + // Find and count the terminal nodes in the backward pass + for (int p = 0; p < getNumPasses(); p++) { + set visited; + _numBwdTerminal[p] = 0; + for (int t = 0; t < _convNetThreads.size(); t++) { + vector& cl = _convNetThreads[t]->getCostLayers(); + for (int c = 0; c < cl.size(); c++) { + findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p); + } + } + } + + _dp = new DataProvider(minibatchSize); +// Py_DECREF(layerList); + delete layerList; +} + +ConvNet::~ConvNet() { + for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { + (*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET)); + (*it)->join(); + delete *it; + } + for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) { + delete *it; + } + for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) { + DEVICE_MEMORY_MANAGER::destroyInstance(*it); + } + HOST_MEMORY_MANAGER::destroyInstance(); + delete _sync; + delete _dataCopyPD; + delete _dp; +} + +void ConvNet::stop() { + getWorkerQueue().enqueue(new 
ExitWorker(*this)); + join(); +} + +PipeDispenser& ConvNet::getDataCopyPD() { + return *_dataCopyPD; +} + +void ConvNet::initDataLayers(PyObjectV* layerList) { + for (int i = 0; i < layerList->size(); i++) { + PyObject* paramsDict = layerList->at(i); + std::string layerType = pyDictGetString(paramsDict, "type"); + + if (layerType == "data") { + int numReplicas = pyDictGetInt(paramsDict, "numReplicas"); + for (int r = 0; r < numReplicas; ++r) { + DataLayer* dataLayer = new DataLayer(this, paramsDict, r); + _dataLayers.push_back(dataLayer); + _layerMap[dataLayer->getName()][r] = dataLayer; + } + } + } +} + +void ConvNet::initGPUThreads(PyObjectV* layerList) { + // Initialize GPU worker threads + for (int i = 0; i < _deviceIDs.size(); ++i) { + ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this); + _convNetThreads.push_back(cng); + for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) { + const std::string& name = it->first; + Layer* layer = it->second; + _layerMap[name][layer->getReplicaID()] = layer; + } + } +} + +void ConvNet::connectReplicas() { + _numReplicasMax = 0; + _numReplicasMin = 1 << 16; + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + _numReplicasMax = max(_numReplicasMax, int(it->second.size())); + _numReplicasMin = min(_numReplicasMin, int(it->second.size())); + for (map::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) { + Layer& l1 = *it2->second; + for (map::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) { + Layer& l2 = *it3->second; + l1.addReplica(l2); + } + } + } +} + +void ConvNet::connectChildren(PyObject* layerParams) { + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str()); + PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs"); + if (inputList != NULL) { + // Iterate over "replicas" of this layer + int numReplicas = _layerMap[it->first].size(); + for (int i = 0; i < PyList_GET_SIZE(inputList); i++) { + std::string inputName = PyString_AsString(PyList_GetItem(inputList, i)); + int numReplicasPrev = _layerMap[inputName].size(); + // How many replicas from the previous layer must this layer be connected to? + int numInputReplicas = numReplicasPrev / numReplicas; + for (int r = 0; r < numReplicas; r++) { + for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) { + it->second[r]->addPrev(*_layerMap[inputName][rp], ridx); + _layerMap[inputName][rp]->addNext(*it->second[r]); + } + } + } + } + } +} + +void ConvNet::findBwdTerminal(Layer& l, set& visited, int& terminal, int passIdx) { + if (visited.count(&l) == 0) { + visited.insert(&l); + if (l.isGradConsumer()) { + bool hasPrevConsumer = false; + if (l.getPrev().size() > 0) { + for (int i = 0; i < l.getPrev()[0].size(); i++) { + // Looking only at 0th replica is fine to see if you have + // grad consumers below you. 
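+                    // OR together the consumer flags of the replica-0 inputs; if none of
+                    // them consumes gradients, this layer is a backward-pass terminal.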
+ hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer(); + } + } + if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) { + terminal++; + l.setBwdTerminal(passIdx); + printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx); + } else if (l.isGradProducer()) { + for (int r = 0; r < l.getPrev().size(); r++) { + for (int i = 0; i < l.getPrev()[r].size(); i++) { + findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx); + } + } + } + } + } +} + +void* ConvNet::run() { + for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { + (*it)->start(); + } + // The manager thread defaults to using the GPU of the first worker. + // Put more logic here if this is inappropriate. + NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID()); + copyToGPU(); + bool exit = false; + while (!exit) { + Worker* worker = _workerQueue.dequeue(); + exit = worker->run(); + delete worker; + } + + return NULL; +} + +Queue& ConvNet::getWorkerQueue() { + return _workerQueue; +} + +Queue& ConvNet::getResultQueue() { + return _resultQueue; +} + +DataProvider& ConvNet::getDataProvider() { + return *_dp; +} + +Layer& ConvNet::getLayer(std::string& name, int replicaID) { + return *_layerMap[name][replicaID]; +} + +void ConvNet::sendMessage(MESSAGES msg, bool sync) { + sendMessage(new Message(msg), sync); +} + +void ConvNet::sendMessage(Message* msg, bool sync) { + for (int i = 0; i < _convNetThreads.size(); i++) { + _convNetThreads[i]->getMessageQueue().enqueue(msg->clone()); + } + + delete msg; + + if (sync) { + syncWithChildren(); + } +} + +void ConvNet::copyToCPU() { + sendMessage(COPY_TO_CPU, true); +} + +void ConvNet::copyToGPU() { + sendMessage(COPY_TO_GPU, false); +} + +void ConvNet::updateWeights(int passIdx) { + sendMessage(UPDATE_WEIGHTS, true); + sendMessage(CONSTRAIN_WEIGHTS, true); +} + +void ConvNet::reset(int passIdx) { + sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false); +} + +void ConvNet::reset() { + reset(0); +} + +// Fprop given data +void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) { + reset(passIdx); + // This is necessary because setData below could delete data. If there's + // an outstanding copy request, this'll cause a segfault. + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->waitForCopyFinish(); + } + + setData(data, passIdx); + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->fprop(passType, passIdx, false); + } + waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); +} + +// Fprop given minibatch idx +void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) { + reset(passIdx); + + bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx; + if (!fromBuffer) { + // This is necessary because setData below could delete data. If there's + // an outstanding copy request, this'll cause a segfault. + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->waitForCopyFinish(); + } + + setData(_dp->getMinibatch(miniIdx), passIdx); + + } else { + setDataFromBuffer(); + } + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->fprop(passType, passIdx, fromBuffer); + } + + if (passIdx == getNumPasses() - 1) { + // Do double-buffering from next minibatch from the DataProvider + setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? 
NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0); + } else { + // Do double-buffering from next microbatch within current minibatch + setBuffer(_data, miniIdx, passIdx + 1); + } + + waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); +} + +void ConvNet::setDataFromBuffer() { + if (_bufferData != _data) { + delete _data; + } + _data = _bufferData; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; +} + +void ConvNet::setData(CPUData& data, int passIdx) { + bool same = _data == _bufferData; + if (&data != _data) { + delete _data; + } + if (&data != _bufferData && !same) { + delete _bufferData; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; + } + _data = &data; + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->copyData(*_data, false, passIdx); + } +} + +void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) { + _bufferData = bufferData; + _bufferMinibatchIdx = bufferMinibatchIdx; + _bufferPassIdx = bufferPassIdx; + if (bufferData != NULL) { + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx); + } + } +} + +CPUData& ConvNet::getData() { + assert(_data != NULL); + return *_data; +} + +void ConvNet::bprop(int passIdx, PASS_TYPE passType) { + _totalPassesDone++; + sendMessage(new BpropStartMessage(passType, passIdx), false); + waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL); + reset(passIdx + 1); +} + +void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) { + for (int rcvd = 0; rcvd < numMsgs; rcvd++) { + Message* m = _msgQueue.dequeue(); + assert(m->getType() == msgType); + delete m; + } +} + +// Same as getCost() but adds results to given cost and returns it +Cost& ConvNet::getCost(Cost& cost) { + Cost &tmp = getCost(); + cost += tmp; + delete &tmp; + return cost; +} + +Cost& ConvNet::getCost() { + Cost& cost = *new Cost(); + for (int t = 0; t < _convNetThreads.size(); t++) { + Cost& tcost = _convNetThreads[t]->getCost(); + cost += tcost; + delete &tcost; + } + return cost; +} + +double ConvNet::getCostValue() { + Cost& cost = getCost(); + double val = cost.getValue(); + delete &cost; + return val; +} + +Queue& ConvNet::getMessageQueue() { + return _msgQueue; +} + +intv& ConvNet::getDeviceIDs() { + return _deviceIDs; +} + +ThreadSynchronizer& ConvNet::getSync() { + return *_sync; +} + +void ConvNet::syncWithChildren() { + sendMessage(SYNC, false); + _sync->sync(); +} + +int ConvNet::getTotalPassesDone() { + return _totalPassesDone; +} + +int ConvNet::getMinibatchSize() { + return _dp->getMinibatchSize(); +} + +int ConvNet::getNumReplicasMax() { + return _numReplicasMax; +} + +int ConvNet::getNumReplicasMin() { + return _numReplicasMin; +} + +int ConvNet::getNumPasses() { + return _numReplicasMax / _numReplicasMin; +} + +void ConvNet::setTrainingProgress(double progress) { + _trainingProgress = progress; +} + +double ConvNet::getTrainingProgress() const { + return _trainingProgress; +} + +bool ConvNet::isConserveMemory() { + return _conserveMem; +} + +/* + * Gradient checking stuff + */ +void ConvNet::checkGradients() { + _numFailures = 0; + _numTests = 0; + _baseErr = 0; + for (int p = 0; p < getNumPasses(); ++p) { + fprop(0, p, PASS_GC); + _baseErr += getCostValue(); + bprop(p, PASS_GC); + } + // We call grad check only on the first replica, + // but because weights are aware of their fellow replicas, + // we can simultaneously perturb the weights of all + // replicas. 
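+    // Each Layer::checkGradient() call compares the analytic gradient against a
+    // finite-difference estimate, roughly (C(w + eps) - C(w)) / (numCases * eps),
+    // computed by ConvNet::checkGradient() further below.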
+ for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + map& layers = it->second; + if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't) + layers[0]->checkGradient(); + } + } + + cout << "------------------------" << endl; + if (_numFailures > 0) { + cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl; + } else { + cout << "ALL " << _numTests << " TESTS PASSED" << endl; + } +} + +// Copies to all replicas +void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) { + int d = NVMatrix::getDeviceID(); + for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { + NVMatrix::setDeviceID(it->second->getDeviceID()); + it->second->getW().copyFromHost(weightsCPU); + } + NVMatrix::setDeviceID(d); +} + +/* + * name: weight matrix name + * eps: finite difference step + */ +bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) { + Matrix numGrad(weights.getNumRows(), weights.getNumCols()); + Matrix diff(numGrad); + numGrad.apply(Matrix::ZERO); + Matrix weightsCPU; + + weights.getW().copyToHost(weightsCPU, true); + + for(int i = 0; i < weights.getNumRows(); i++) { + for (int j = 0; j < weights.getNumCols(); j++) { + float v = weightsCPU(i,j); + weightsCPU(i,j) += eps; + + checkGradient_copyWeightsToGPU(weightsCPU, weights); + + weightsCPU(i,j) = v; + double err = 0; + for (int p = 0; p < getNumPasses(); ++p) { +// printf("trying fprop %d\n", p); + fprop(0, p, PASS_GC); +// printf(" success\n"); + err += getCostValue(); + } + numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps); + if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) { + cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl; + cout << "Consider reducing the sizes of the weights or finite difference steps." << endl; + cout << "Exiting." << endl; + exit(1); + } + checkGradient_copyWeightsToGPU(weightsCPU, weights); + } + } + Matrix gradCPU; + NVMatrix::setDeviceID(weights.getDeviceID()); + map mats; + for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { + mats[it->first] = &it->second->getGrad(); + } + weights.getReducer().reduce(mats, 1, false); + + weights.getGrad().copyToHost(gradCPU, true); + gradCPU.scale(-1.0 / _data->getNumCases()); + float analNorm = gradCPU.norm(); + float numNorm = numGrad.norm(); + numGrad.subtract(gradCPU, diff); + float relErr = diff.norm() / analNorm; + bool fail = relErr >= GC_REL_ERR_THRESH; + if (fail || !GC_SUPPRESS_PASSES) { + cout << "========================" << endl; + printf("(%s) %s GRADIENT CHECK\n", fail ? 
"****FAIL****" : "PASS", name.c_str()); + cout << "========================" << endl; + cout << "Analytic:" << endl; + gradCPU.print(0, 6, 0, 4); + cout << "Numeric:" << endl; + numGrad.print(0, 6, 0, 4); + printf("Analytic norm: %e\n", analNorm); + printf("Numeric norm: %e\n", numNorm); + printf("Relative error: %e\n", relErr); + } + _numTests++; + _numFailures += fail; + return fail; +} + +/* + * ======================================================================================================= + * ConvNetThread + * ======================================================================================================= + */ +ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet) + : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) { + try { + int numLayers = layerList->size(); + + for (int i = 0; i < numLayers; i++) { + PyObject* paramsDict = layerList->at(i); + std::string layerType = pyDictGetString(paramsDict, "type"); + if (layerType != "data") { + intv& gpus = *pyDictGetIntV(paramsDict, "gpu"); + int rid = indexOf(gpus, deviceIdx); + if (rid >= 0) { + initLayer(paramsDict, rid); + } + delete &gpus; + } + } + } catch (std::string& s) { + cout << "Error creating ConvNet: " << s << endl; + exit(1); + } +} + +ConvNetThread::~ConvNetThread() { + NVMatrix::setDeviceID(_deviceID); + NVMatrix::destroyCublas(); + NVMatrix::destroyRandom(); + for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + delete it->second; + } + _nameLayerMap.clear(); +} + +void ConvNetThread::startTimer() { + NVMatrix::syncStream(); + _timer.start(); +} + +double ConvNetThread::stopTimer() { + NVMatrix::syncStream(); + return _timer.stop(); +} + +void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) { + std::string type = pyDictGetString(paramsDict, "type"); + std::string name = pyDictGetString(paramsDict, "name"); + if (type == "fc") { + _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false); + } else if (type == "sfc") { + _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false); + } else if (type == "conv") { + _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID); + } else if (type == "local") { + _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID); + } else if (type == "pool") { + _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID); + } else if (type == "cmpool") { + _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID); + } else if (type == "rnorm") { + _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID); + } else if (type == "cmrnorm") { + _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID); + } else if (type == "cnorm") { + _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID); + } else if (type == "softmax") { + _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID); + } else if (type == "eltsum") { + _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID); + } else if (type == "eltmax") { + _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID); + } else if (type == "neuron") { + _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID); + } else if (type == "nailbed") { + _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID); + } else if (type == "blur") { + _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID); + } else 
if (type == "href") { + _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID); + } else if (type == "resize") { + _nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID); + } else if (type == "rgb2yuv") { + _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID); + } else if (type == "rgb2lab") { + _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID); + } else if (type == "rscale") { + _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID); + } else if (type == "crop") { + _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID); + } else if (type == "concat") { + _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID); + } else if (type == "pass") { + _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID); + } else if (type == "dropout") { + _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID); + } else if (type == "dropout2") { + _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID); + } else if (strncmp(type.c_str(), "cost.", 5) == 0) { + CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID); + _nameLayerMap[name] = c; + _costs.push_back(c); + } else { + throw std::string("Unknown layer type ") + type; + } +} + +/* + * This executes in a new CPU thread so it's OK to initialize CUDA stuff here. + */ +void ConvNetThread::initCuda() { + NVMatrix::setDeviceID(_deviceID); + checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); + for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) { + int d = _convNet->getDeviceIDs()[i]; + if (d != _deviceID) { + if (NVMatrix::canAccessPeer(_deviceID, d)) { + printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d); + checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0)); + } else { + printf("No peer access GPU %d --> GPU %d\n", _deviceID, d); + } + } + } +// NVMatrix::syncStream(); + NVMatrix::initCublas(); + NVMatrix::initRandom(/*7*/); + srand(time(0)); +} + +void* ConvNetThread::run() { + initCuda(); + bool exit = false; + while (!exit) { + Message* m = _msgQueue.dequeue(); + if (m->getType() == FPROP_READY) { + FpropMessage* msg = static_cast(m); + msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx()); + } else if (m->getType() == BPROP_READY) { + BpropMessage* msg = static_cast(m); + msg->getToLayer().incRcvdBInputMsgs(); + msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx()); + } else if (m->getType() == BPROP_START) { + BpropStartMessage* msg = static_cast(m); + for (int i = 0; i < _costs.size(); i++) { + dynamic_cast(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx()); + } + } else if (m->getType() == SYNC) { + NVMatrix::syncStream(); + _convNet->getSync().sync(); + } else if (m->getType() == COPY_TO_CPU) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->copyToCPU(); + } + } else if (m->getType() == COPY_TO_GPU) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->copyToGPU(); + } + } else if (m->getType() == RESET) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->reset(); + } + } else if (m->getType() == RESET_PASS_IDX) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->resetPassIdx(); + } + } else if (m->getType() == UPDATE_WEIGHTS) { + for (NameLayerMap::iterator it = 
_nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->updateWeights(); + } + } else if (m->getType() == CONSTRAIN_WEIGHTS) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->constrainWeights(); + } + } else if (m->getType() == EXIT_CONVNET) { + exit = true; + } + delete m; + } + return NULL; +} + +Cost& ConvNetThread::getCost() { + // In a single ConvNetThread, all costs are guaranteed to be different + // (i.e. not replicas of one another) + return *new Cost(_costs); +} + +Layer& ConvNetThread::getLayer(std::string& name) { + return *_nameLayerMap[name]; +} + +int ConvNetThread::getDeviceID() { + return _deviceID; +} + +Queue& ConvNetThread::getMessageQueue() { + return _msgQueue; +} + +vector& ConvNetThread::getCostLayers() { + return _costs; +} + +NameLayerMap& ConvNetThread::getLayerMap() { + return _nameLayerMap; +} + +ConvNet& ConvNetThread::getConvNet() { + return *_convNet; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu new file mode 100644 index 0000000..37afa33 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu @@ -0,0 +1,378 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../include/copypipeline.cuh" +//#include "gpu_util.cuh" + +using namespace std; + +/* ========================= + * ICopySegment + * ========================= + */ +ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) + : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) { + _execDeviceID = _deviceID; +} + +ICopySegment::~ICopySegment() { + if (_stream != NULL) { + checkCudaErrors(cudaStreamDestroy(_stream)); + } +} + +void* ICopySegment::run() { + assert(_execDeviceID != DEVICE_HOST); + NVMatrix::setDeviceID(_execDeviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking)); + bool exit = false; + while (!exit) { + CopyMessage& msg = *_queue.dequeue(); + if (msg.getType() == CopyMessage::EXIT) { + exit = true; + } else { + bool term = processMessage(msg); + if (term) { + assert(_finishQueue != NULL); + _finishQueue->enqueue(1); + } + } + delete &msg; + } + return NULL; +} + +NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) { + NVMatrix& line = mat.reshaped(1, mat.getNumElements()); + int start = chunkIdx * chunkSize; + int end = min((chunkIdx+1) * chunkSize, mat.getNumElements()); + NVMatrix& chunk = line.sliceCols(start, end); + delete &line; + return chunk; +} + +inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) { + if (getDeviceID() == DEVICE_HOST) { + return _hmat; + } + return msg.getMatrix(getDeviceID()); +} + +Queue& ICopySegment::getQueue() { + return _queue; +} + +inline int ICopySegment::getDeviceID() { + return _deviceID; +} + +void ICopySegment::addPrev(ICopySegment& c) { + _prev = &c; + if (_deviceID == DEVICE_HOST) { + _execDeviceID = c.getDeviceID(); + } +} + +void ICopySegment::addNext(CopyPeer& c) { + _next.push_back(&c); + c.addPrev(*this); +} + +bool ICopySegment::isTerminal() const { + return _next.size() == 0; +} + +/* ========================= + * CopySource + * ========================= + */ +CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) { +} + +bool CopySource::processMessage(CopyMessage& msg) { + assert(msg.getType() == CopyMessage::COPY_START); + int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE)))); + int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks); +// printf("num chunks: %d\n", numChunks); + for (int c = 0; c <= numChunks; ++c) { + for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { + (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices())); + } + } + return false; +} + +inline bool CopySource::isSource() const { + return true; +} + +/* ========================= + * CopyPeer + * ========================= + */ +CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) : ICopySegment(parent, deviceID, finishQueue) { +} + +bool CopyPeer::processMessage(CopyMessage& msg) { + assert(msg.getType() == CopyMessage::COPY_CHUNK); + CopyChunkMessage& cmsg = *static_cast(&msg); + if (cmsg.getChunkIdx() < cmsg.getNumChunks()) { + if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) { + getMatrix(msg).resize(_prev->getMatrix(msg)); + } +// getMatrix(msg).printShape("getMatrix(msg)"); +// _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)"); + 
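+            // The upstream segment's matrix and this segment's matrix must agree in
+            // shape before the chunk add/copy below.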
assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg))); + const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0; + const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1; + NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream); + NVMatrix::syncStream(_stream); + delete &prevChunk; + delete &myChunk; + } + for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { + (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg)); + } + return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal(); +} + +inline bool CopyPeer::isSource() const { + return false; +} + +/* ========================= + * IBroadcastNetwork + * ========================= + */ +IBroadcastNetwork& IBroadcastNetwork::make(set devices, int srcDevice) { + if (devices.size() == 8) { + return (new EightGPUBroadcaster1(devices, srcDevice))->construct(); + } else if (devices.size() == 1) { + return (new NullBroadcaster(devices, srcDevice))->construct(); + } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { + return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); + } + return (new NaiveBroadcaster(devices, srcDevice))->construct(); +} + +IBroadcastNetwork::IBroadcastNetwork(set& devices, int srcDeviceID, int numTerminal) + : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) { +} + +IBroadcastNetwork::~IBroadcastNetwork() { + vector v; + v.insert(v.end(), _peers.begin(), _peers.end()); + v.insert(v.end(), _src); + for (vector::const_iterator it = v.begin(); it != v.end(); ++it) { + (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT)); + (*it)->join(); + delete *it; + } +} + +IBroadcastNetwork& IBroadcastNetwork::construct() { + assert(!_constructed); + pair,vector > gpus = makeGPULists(); + _src = new CopySource(*this, _srcDeviceID); + makePeers(gpus); + makeConnections(); + _src->start(); + for (vector::const_iterator it = _peers.begin(); it != _peers.end(); ++it) { + (*it)->start(); + } + _constructed = true; + return *this; +} + +pair,vector > IBroadcastNetwork::makeGPULists() { + vector same, other; + for (set::const_iterator it = _devices.begin(); it != _devices.end(); ++it) { + if (*it != _srcDeviceID) { + if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) { + same.insert(same.begin() + rand() % (1 + same.size()), *it); + } else { + other.insert(other.begin() + rand() % (1 + other.size()), *it); + } + } + } + return pair,vector >(same, other); +} + +void IBroadcastNetwork::broadcast(std::map& mats) { + _broadcast(mats, 1, 0); +} + +void IBroadcastNetwork::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { + assert(_constructed); + assert(_finishQueue.getNumElements() == 0); + assert(mats.size() == _devices.size()); + assert(mats.size() > 1); + if (mats[_srcDeviceID]->getNumElements() == 0) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + it->second->resize(*mats[_srcDeviceID]); + } + } else { + _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats)); + for (int i = 0; i < _numTerminal; ++i) { + _finishQueue.dequeue(); + } + } + assert(_finishQueue.getNumElements() == 0); +} + +int IBroadcastNetwork::getSourceDeviceID() const { + return _srcDeviceID; +} + +void IBroadcastNetwork::makePeers(pair,vector 
>& gpus) { + vector& same = gpus.first, &other = gpus.second; + for (int i = 0; i < same.size(); ++i) { + _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue)); + } + for (int i = 0; i < other.size(); ++i) { + _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue)); + } + _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7] +} + +/* ========================= + * ISafeBroadcastNetwork + * ========================= + */ +ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set devices, int srcDevice) { + if (devices.size() == 1) { + return (new NullBroadcaster(devices, srcDevice))->construct(); + } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { + return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); + } + return (new NaiveBroadcaster(devices, srcDevice))->construct(); +} + +ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) { +} + +void ISafeBroadcastNetwork::broadcast(std::map& mats, float scaleSource, float scaleTargets) { + _broadcast(mats, scaleSource, scaleTargets); +} + +ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() { + IBroadcastNetwork::construct(); + return *this; +} + +/* ========================= + * NullBroadcaster + * ========================= + */ +NullBroadcaster::NullBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { +} + +void NullBroadcaster::makeConnections() { +} + +NullBroadcaster& NullBroadcaster::construct() { + _constructed = true; + return *this; +} + +void NullBroadcaster::broadcast(std::map& mats, float scaleSource, float scaleTargets) { +} + +void NullBroadcaster::broadcast(std::map& mats) { +} + +/* ========================= + * NaiveBroadcaster + * ========================= + * + * This one does src -> host -> all + */ +NaiveBroadcaster::NaiveBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size()-1) { +} + +void NaiveBroadcaster::makeConnections() { + _src->addNext(*_peers.back()); // Make connection src -> host + for (int i = 0; i < _peers.size() - 1; ++i) { + if (_peers[i]->getDeviceID() != _src->getDeviceID()) { + _peers.back()->addNext(*_peers[i]); // Make connection host -> peer + } + } +} + +/* ========================= + * EightGPUBroadcaster1 + * ========================= + * + * This one does a fancy graph + */ +EightGPUBroadcaster1::EightGPUBroadcaster1(set& devices, int srcDeviceID) : IBroadcastNetwork(devices, srcDeviceID, 4) { +} + +void EightGPUBroadcaster1::makeConnections() { + _src->addNext(*_peers[7]); + _peers[7]->addNext(*_peers[0]); + _peers[7]->addNext(*_peers[1]); + _peers[7]->addNext(*_peers[3]); + _peers[7]->addNext(*_peers[4]); + + _peers[1]->addNext(*_peers[2]); + _peers[3]->addNext(*_peers[5]); + _peers[4]->addNext(*_peers[6]); +} + +/* ========================= + * TwoPeeringGPUsBroadcaster + * ========================= + */ +TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { + _tgtDeviceID = *devices.begin() == srcDeviceID ? 
*(++devices.begin()) : *devices.begin(); +} + +TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() { + if (_constructed) { + checkCudaErrors(cudaStreamDestroy(_tgtStream)); + } +} + +void TwoPeeringGPUsBroadcaster::makeConnections() { +} + +void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) { + if (d >= 0) { + NVMatrix::setDeviceID(d); + } +} + +ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() { + assert(!_constructed); + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_tgtDeviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking)); + resetDeviceID(d); + _constructed = true; + return *this; +} + +void TwoPeeringGPUsBroadcaster::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_tgtDeviceID); + mats[_tgtDeviceID]->add(*mats[_srcDeviceID], scaleTargets, scaleSource, *mats[_tgtDeviceID], _tgtStream); + NVMatrix::syncStream(_tgtStream); + resetDeviceID(d); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu new file mode 100644 index 0000000..55d466a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu @@ -0,0 +1,113 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/cost.cuh" + +using namespace std; + +/* + * ===================== + * Cost + * ===================== + */ + +Cost::Cost() { +} + +Cost::Cost(vector& costs) { + for (vector::iterator it = costs.begin(); it != costs.end(); ++it) { + _costMap[(*it)->getName()] = &(*it)->getCost(); + _costCoeffMap[(*it)->getName()] = (*it)->getCoeff(); + _numCases[(*it)->getName()] = (*it)->getNumCases(); + } +} + +int Cost::getNumCases() { + return _numCases.size() == 0 ? 0 : _numCases.begin()->second; +} + +map& Cost::getNumCasesMap() { + return _numCases; +} + +doublev& Cost::operator [](const std::string s) { + return *_costMap[s]; +} + +CostMap& Cost::getCostMap() { + return _costMap; +} + +CostCoeffMap& Cost::getCostCoeffMap() { + return _costCoeffMap; +} + +double Cost::getValue() { + double val = 0; + for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 
0 : it->second->at(0)); + } + return val; +} + +Cost& Cost::operator += (Cost& er) { + CostMap& otherMap = er.getCostMap(); + CostCoeffMap& otherCoeffMap = er.getCostCoeffMap(); + + for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) { + bool newCost = _costMap.count(it->first) == 0; + if (newCost) { + _costMap[it->first] = new doublev(); + _costCoeffMap[it->first] = otherCoeffMap[it->first]; + _numCases[it->first] = er.getNumCasesMap()[it->first]; + } else { + _numCases[it->first] += er.getNumCasesMap()[it->first]; + } + + doublev& myVec = *_costMap[it->first]; + doublev& otherVec = *otherMap[it->first]; + assert(myVec.size() == 0 || otherVec.size() == 0 || myVec.size() == otherVec.size()); + // Add costs from otherVec to me + for (int i = 0; i < otherVec.size(); i++) { + if (myVec.size() <= i) { + myVec.push_back(0); + } + myVec[i] += otherVec[i]; + } + } + return *this; +} + +Cost::~Cost() { + for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + delete it->second; + } +} + +void Cost::print() { + for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]); + doublev& vec = *_costMap[it->first]; + for (int z = 0; z < vec.size(); ++z) { + printf("%.3f", vec[z]); + if (z < vec.size() - 1) { + printf(", "); + } + } + printf("\n"); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu new file mode 100644 index 0000000..6c2cdcd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu @@ -0,0 +1,82 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../util/include/matrix.h" +#include "../include/data.cuh" +#include "../include/timer.cuh" + +using namespace std; + +DataProvider::DataProvider(int minibatchSize) : + _minibatchSize(minibatchSize), _hData(NULL) { +} + +void DataProvider::clearData() { + delete _hData; + _hData = NULL; +} + +void DataProvider::setData(CPUData& hData) { + // DataWorker calls clearData + _hData = &hData; + assert(_hData != NULL); +} + +CPUData& DataProvider::getMinibatch(int idx) { + assert(idx >= 0 && idx < getNumMinibatches()); + return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize); +} + +CPUData& DataProvider::getDataSlice(int startCase, int endCase) { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + endCase = min(_hData->getNumCases(), endCase); + // TODO: maintain these matrices, no point re-creating them all the time + MatrixV& miniData = *new MatrixV(); + + for (int i = 0; i < _hData->getData().size(); i++) { + // NOTE: if hData is transposed, then the output minibatch matrix + // can be a view. No need to allocate new CPU memory here. Might + // want to look into optimizing that in the future, though it's + // unlikely to be a big deal. 
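+        // Two cases: for transposed data the column slice is taken directly; for
+        // non-transposed data it is copied into a newly allocated Matrix.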
+ if (_hData->isTrans()) { + miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase)); + } else { + miniData.push_back(new Matrix()); + (*_hData)[i].sliceCols(startCase, endCase, *miniData.back()); + } + } + CPUData& cpuData = *new CPUData(&miniData); + return *new CPUData(&miniData); +} + +int DataProvider::getNumMinibatches() { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + return DIVUP(_hData->getNumCases(), _minibatchSize); +} + +int DataProvider::getMinibatchSize() { + return _minibatchSize; +} + +int DataProvider::getNumCases() { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + return _hData->getNumCases(); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu new file mode 100644 index 0000000..0a70182 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu @@ -0,0 +1,202 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/util.cuh" +#include "../include/gradreducer.cuh" + +using namespace std; + +/* ===================== + * IGradReducer + * ===================== + */ +IActGradReducer::IActGradReducer(Layer& parent, map numExpectedMsgs) + : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) { + _numExpectedMsgsTotal = 0; + for (map::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) { + _numExpectedMsgsTotal += it->second; + } +// printf("%s[%d] expected %d backward msgs\n", parent.getName().c_str(), parent.getReplicaID(), _numExpectedMsgsTotal); +} + +IActGradReducer::~IActGradReducer() { + +} + +void* IActGradReducer::run() { + while (true) { + reset(); + if (reduce()) { + break; + } + _finishQueue.enqueue(0); + } + return NULL; +} + +// Cost layer will have nothing to dequeue, so just return immediately. 
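+// Blocks until one complete reduction pass has finished; returns immediately when
+// this reducer expects no gradient messages at all. Typical usage (sketch): the
+// backward pass calls enqueueReduction() once per incoming gradient, and the owner
+// calls waitForFinish() before reading the reduced gradients.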
+int IActGradReducer::waitForFinish() {
+    if (_numExpectedMsgsTotal > 0) {
+        int i = _finishQueue.dequeue();
+        assert(_finishQueue.getNumElements() == 0);
+        return i;
+    }
+//    printf("%s not waiting for finish\n", _name.c_str());
+    return 0;
+}
+
+IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int,int> numExpectedMsgs) {
+    int tgtDeviceID = parent.getDeviceID();
+    if (numExpectedMsgs.count(tgtDeviceID) == 0) {
+        numExpectedMsgs[tgtDeviceID] = 0;
+    }
+    if (numExpectedMsgs.size() == 8) {
+        return *new ParallelActGradReducer(parent, numExpectedMsgs);
+    }
+    return *new SequentialActGradReducer(parent, numExpectedMsgs);
+}
+
+/* =====================
+ * SequentialGradReducer
+ * =====================
+ */
+SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int,int> numExpectedMsgs)
+    : IActGradReducer(parent, numExpectedMsgs) {
+    intv deviceIDs;
+    int tgtDeviceID = parent.getDeviceID();
+    for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
+        if (it->first != tgtDeviceID) {
+            deviceIDs.push_back(it->first);
+        }
+    }
+    if (numExpectedMsgs[tgtDeviceID] > 0) {
+        deviceIDs.push_back(tgtDeviceID);
+    }
+
+    sort(deviceIDs.begin(), deviceIDs.end());
+
+    int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
+    for (int i = 0; i < deviceIDs.size(); ++i) {
+        if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
+            firstDeviceIdx = i;
+            firstDeviceID = deviceIDs[i];
+        }
+    }
+
+    // This is the order in which we process devices.
+    for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
+        int d = deviceIDs[i];
+        _deviceIDs.push_back(d);
+        _messageQueues[d] = new Queue<int>();
+    }
+    //shuffleVector(_deviceIDs, 1, _deviceIDs.size());
+    _broadcaster = new StreamBroadcast();
+
+    // Note that we MUST process the tgtDeviceID first because
+    // we write to it at every iteration, and the computation
+    // thread writes to it too. By processing it first we ensure
+    // that there's no race condition.
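+    // The assert below checks that invariant: if the target device expects any
+    // messages at all, it must be the first entry in _deviceIDs.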
+ assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID); + reset(); +} + +SequentialActGradReducer::~SequentialActGradReducer() { + for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { + delete it->second; + } + delete _broadcaster; +} + +void SequentialActGradReducer::reset() { + for (map::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) { + _numReceivedMsgs[it->first] = 0; + } +} + +bool SequentialActGradReducer::reduce() { + int tgtDeviceID = _parent->getDeviceID(); + for (int didx = 0; didx < _deviceIDs.size(); ) { + int d = _deviceIDs[didx]; + _numReceivedMsgs[d] += _messageQueues[d]->dequeue(); + if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) { + if (d != tgtDeviceID) { + NVMatrix::setDeviceID(tgtDeviceID); + + _parent->getActsGrad().resize(_parent->getActsGrad(d)); + map mats; + mats[d] = &_parent->getActsGrad(d); + mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID); + + _broadcaster->transfer(mats, d, didx > 0, 1); + } + didx++; + assert(_messageQueues[d]->getNumElements() == 0); + } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit + return true; + } + } + return false; +} + +void SequentialActGradReducer::enqueueReduction(int deviceID) { + _messageQueues[deviceID]->enqueue(1); +} + +void SequentialActGradReducer::stop() { + for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { + it->second->enqueue(ACT_GRAD_REDUCER_EXIT); + } + join(); +} + +/* ===================== + * ParallelActGradReducer + * ===================== + */ +ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map numExpectedMsgs) + : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) { + _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct(); + + _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0; +} + +bool ParallelActGradReducer::reduce() { + // TODO: make it so that you can start the reduction before you've received all the messages. + while(_numReceivedMsgs < _numExpectedMsgsTotal) { + _numReceivedMsgs += _messageQueue.dequeue(); + } + if (_numReceivedMsgs > _numExpectedMsgsTotal) { + return true; // exit + } + map mats = _parent->getAllActsGrads(); + _reducer->reduce(mats, 1, _scaleTarget); + assert(_messageQueue.getNumElements() == 0); + return false; + +} + +void ParallelActGradReducer::enqueueReduction(int deviceID) { + _messageQueue.enqueue(1); +} + +void ParallelActGradReducer::stop() { + _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT); + join(); +} + +void ParallelActGradReducer::reset() { + _numReceivedMsgs = 0; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp new file mode 100644 index 0000000..7d158df --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp @@ -0,0 +1,135 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/jpeg.h" + +using namespace std; + +/* ======================== + * DecoderThread + * ======================== + */ +DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview) +: Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img), + _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview), + _decodeTarget(0), _decodeTargetSize(0) { + + _inner_pixels = _inner_size * _inner_size; + _rseed = time(0); +} + +DecoderThread::~DecoderThread(){ + free(_decodeTarget); +} + +void* DecoderThread::run() { + int numSrcCases = PyList_GET_SIZE(_pyList); + assert(_target->getNumCols() == _inner_pixels * 3); + assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1)); + + int width, height; + + for (int64 i = _start_img; i < _end_img; ++i) { + decodeJpeg(i, width, height); + assert((width == _img_size && height >= _img_size) + || (height == _img_size && width >= _img_size)); + if (_multiview) { + for (int flip = 0; flip < 2; ++flip) { + crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left + crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right + crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center + crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left + crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right + } + } else { + crop(i, width, height, !_test && (rand_r(&_rseed) % 2)); + } + + } + return NULL; +} + +void DecoderThread::decodeJpeg(int idx, int& width, int& height) { + PyObject* pySrc = PyList_GET_ITEM(_pyList, idx); + unsigned char* src = (unsigned char*)PyString_AsString(pySrc); + size_t src_len = PyString_GET_SIZE(pySrc); + + struct jpeg_decompress_struct cinf; + struct jpeg_error_mgr jerr; + cinf.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinf); + jpeg_mem_src(&cinf, src, src_len); + assert(jpeg_read_header(&cinf, TRUE)); + cinf.out_color_space = JCS_RGB; + assert(jpeg_start_decompress(&cinf)); + assert(cinf.num_components == 3 || cinf.num_components == 1); + width = cinf.image_width; + height = cinf.image_height; + + if (_decodeTargetSize < width * height * 3) { + free(_decodeTarget); + _decodeTargetSize = width * height * 3 * 3; + _decodeTarget = (unsigned char*)malloc(_decodeTargetSize); + } + + while (cinf.output_scanline < cinf.output_height) { + JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline]; + assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0); + } + assert(jpeg_finish_decompress(&cinf)); + jpeg_destroy_decompress(&cinf); +} + +/* + * Uniform in [0,1) + */ +inline double DecoderThread::randUniform() { + return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1); +} + +/* + * Uniform in [min, max) + */ +inline double DecoderThread::randUniform(double min, double max) { + return (max - min) * randUniform() + min; +} + +void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) { + crop(i, src_width, src_height, flip, -1, -1); +} + +void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) { + const int64 
border_size_y = src_height - _inner_size; + const int64 border_size_x = src_width - _inner_size; + if (crop_start_x < 0) { + crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1)); + } + if (crop_start_y < 0) { + crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1)); + } + const int64 src_pixels = src_width * src_height; + for (int64 c = 0; c < 3; ++c) { + for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) { + for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) { + assert((y >= 0 && y < src_height && x >= 0 && x < src_width)); + _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size + + (flip ? (_inner_size - 1 - x + crop_start_x) + : (x - crop_start_x))) + = _decodeTarget[3 * (y * src_width + x) + c]; + } + } + } +} \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu new file mode 100644 index 0000000..4ff54f8 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu @@ -0,0 +1,2306 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../../cudaconv3/include/cudaconv2.cuh" +#include "../../util/include/matrix.h" +#include "../include/layer_kernels.cuh" +#include "../include/layer.cuh" +#include "../include/data.cuh" +#include "../include/util.cuh" +#include "../include/weights.cuh" + +using namespace std; + +/* + * ======================= + * Layer + * ======================= + */ +Layer::Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) : + _convNetThread(convNetThread), _replicaID(replicaID), _trans(trans) { + _name = pyDictGetString(paramsDict, "name"); + _type = pyDictGetString(paramsDict, "type"); + + _foundGradConsumers = false; + _gradConsumer = pyDictGetInt(paramsDict, "gradConsumer"); + _actsTarget = pyDictGetInt(paramsDict, "actsTarget"); + _actsGradTarget = pyDictGetInt(paramsDict, "actsGradTarget"); + _numOutputs = pyDictGetInt(paramsDict, "outputs"); + _numReplicas = pyDictGetInt(paramsDict, "numReplicas"); + _numReplicasPrev = 1; + _rcvdBInputMsgs = 0; + + _actBroadcaster = NULL; + _gradReducer = NULL; + _initialized = false; +} + +Layer::~Layer() { + if (_actBroadcaster != NULL) { + _actBroadcaster->stop(); + delete _actBroadcaster; + } + if (_gradReducer != NULL) { + _gradReducer->stop(); + delete _gradReducer; + } + // For now, gradReducer doesn't have a destructor +// delete _gradReducer; + for (std::map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } + for (std::map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } +} + +cudaStream_t Layer::getStream() { + 
assert(getDeviceID() >= 0); + return NVMatrix::getDefaultStream(getDeviceID()); +} + +void Layer::syncStream() { + NVMatrix::syncStream(getStream()); +} + +void Layer::fpropNext(PASS_TYPE passType, int passIdx) { + if (_next.size() > 0) { + if (getFwdActiveReplicaIdx(passIdx) == 0/*getReplicaIdx()*/) { // 0 turns on pipelining + if (_nextDeviceIDs.size() > 1 || (_nextDeviceIDs.size() == 1 && _nextDeviceIDs[0] != getDeviceID())) { + syncStream(); // Make sure I've finished computing before broadcasting + } + getActBroadcaster().getMessageQueue().enqueue(new BroadcastMessage(getAllActs(), getDeviceID(), getReplicaIdx(), _broadcastFinishQueue)); + } + if (getFwdActiveReplicaIdx(passIdx) == getReplicaIdx()) { + _broadcastFinishQueue.dequeue(); + assert(_broadcastFinishQueue.getNumElements() == 0); + } + } + + for (int i = 0; i < _next.size(); i++) { + _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx)); + } +} + +bool Layer::fprop(PASS_TYPE passType, int passIdx) { + _rcvdFInputMsgs++; + // I require messages from *all* input replicas because it makes the propagation easier to think about. + // Without this requirement, when all fprop terminal msgs arrive to ConvNet, the forward propagation + // might not actually be finished yet. + if (_rcvdFInputMsgs == getNumExpectedFwdMsgs()) { +// printf("Layer %s[%d] fprop\n", _name.c_str(), getReplicaID()); + int ridx = getFwdActiveInputReplicaIdx(passIdx); + assert(getDeviceID() == NVMatrix::getDeviceID()); + map v; + if (ridx >= 0) { + for (int i = 0; i < getNumLayersPrev(); i++) { + v[i] = &_prev[ridx][i]->getActs(getDeviceID()); + } + } + fprop(v, passType, passIdx); + return true; + } + return false; +} + +void Layer::fprop(map& v, PASS_TYPE passType, int passIdx) { + if (getFwdActiveInputReplicaIdx(passIdx) >= 0) { + assert(v.size() == getNumLayersPrev()); + _inputs.clear(); + _inputs.insert(v.begin(), v.end()); + + int numCases = _inputs[0]->getLeadingDim(); + for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + it->second->getMemory(numCases); + } + + if (numCases > 0) { + //printf("layer %s fprop, numcases: %d\n", _name.c_str(), numCases); + _rcvdFInputMsgs = getNumExpectedFwdMsgs(); + for (map::iterator it = v.begin(); it != v.end(); ++it) { + it->second->transpose(_trans); + } + getActs().transpose(_trans); + + fpropCommon(passType); + + // First do fprop on the input whose acts matrix I'm sharing, if any + if (_actsTarget >= 0) { + fpropActs(_actsTarget, 0, passType, passIdx); + } + // Then add the rest of the inputs to that + for (int i = 0; i < getNumLayersPrev(); i++) { + if (i != _actsTarget) { + fpropActs(i, _actsTarget >= 0 || i > 0, passType, passIdx); + } + } + } + } + fpropNext(passType, passIdx); +} + +void Layer::truncBwdActs() { + // Only truncate actsGrad if I own it + if (_actsGradTarget < 0) { + for (map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + it->second->getMemorySource().truncate(getName()); + } + } + if (_actsTarget < 0) { + for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + it->second->getMemorySource().truncate(getName()); + } + } +} + +int Layer::getNumGradProducersNext() { + return _numGradProducersNext; +} + +int Layer::getNumExpectedBwdMsgs() { + return _numGradProducersNext * getNumSiblingReplicas(); +} + +int Layer::getNumExpectedFwdMsgs() { + return getNumLayersPrev() * getNumInputReplicas(); +} + +void Layer::bprop(PASS_TYPE passType, int passIdx) { + if 
(getBwdActiveInputReplicaIdx(passIdx) >= 0 && _rcvdBInputMsgs == getNumExpectedBwdMsgs()) { +// printf("Layer %s[%d] bprop\n", _name.c_str(), getReplicaID()); + if (_gradReducer != NULL) { + _gradReducer->waitForFinish(); + } + + // This does sync, but only if it has grad consumers below! so we must sync again before sending bprop terminal messages + bprop(getActsGrad(), passType, passIdx); + + if (_bwdTerminal[passIdx]) { + syncStream(); + getConvNet().getMessageQueue().enqueue(new Message(BPROP_TERMINAL)); + } + } +} + +void Layer::bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx) { + Layer& prev = *_prev[replicaIdx][inputIdx]; + if (prev.isGradConsumer() && isGradProducer(prev.getName())) { + if (v.getLeadingDim() > 0) { // Only do computation if #cases > 0 + bpropActs(v, replicaIdx, inputIdx, prev.getNumComputedActsGrads(getDeviceID()) > 0, passType); + } + prev.getNumComputedActsGrads(getDeviceID())++; + // Synchronize if the previous layer is going to actually do a reduction. + // If the previous layer is on the same GPU as us and has no next layers + // on other GPUs then it won't need to do a reduction. + if (prev.getNextDeviceIDs().size() > 1 || (prev.getNextDeviceIDs().size() == 1 && getDeviceID() != prev.getDeviceID())) { + syncStream(); + } + prev.getGradReducer().enqueueReduction(getDeviceID()); + } +} + +void Layer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) { + + v.transpose(_trans); + assert(getDeviceID() == NVMatrix::getDeviceID()); + int ridx = getBwdActiveInputReplicaIdx(passIdx); + LayerV& prev = _prev[ridx]; + map > prevByDevice = _prevByDevice[ridx]; + + for (int i = 0; i < prev.size(); i++) { + _inputs[i]->transpose(_trans); + prev[i]->getActsGrad().transpose(_trans); + } + getActs().transpose(_trans); + // NOTE: this should be here (before the bpropActs) because if you have a layer + // that has a weight matrix AND actsGradTarget >= 0, then the stuff below will overwrite + // v which is used in bpropCommon. So bpropCommon must come first. + bpropCommon(v, ridx, passType); + + if (isGradProducer()) { + // First propagate activity gradient to all layers whose activity + // gradient matrix I'm definitely not sharing. + for (map >::const_iterator it = prevByDevice.begin(); it != prevByDevice.end(); ++it) { + const set& deviceLayers = it->second; + for (set::const_iterator it2 = deviceLayers.begin(); it2 != deviceLayers.end(); ++it2) { + if (_actsGradTarget != (*it2)->getInputIdx(_name)) { + bpropActsCall(v, passType, ridx, (*it2)->getInputIdx(_name)); + } + } + } + + // Then propagate activity gradient to the layer whose activity gradient + // matrix I'm sharing, if any. + if (_actsGradTarget >= 0) { + bpropActsCall(v, passType, ridx, _actsGradTarget); + } + } + + // Synchronization is necessary because the kernel calls that compute my backward acts + // execute asynchronously. Therefore I don't want to tell other threads that I've + // computed bprop activities for them when in fact I've only called a function which + // will eventually compute them. + if (_prevDeviceIDs.size() > 1 || (_prevDeviceIDs.size() == 1 && _prevDeviceIDs[0] != getDeviceID())) { + syncStream(); + } + + if (getConvNet().isConserveMemory()) { + truncBwdActs(); + } + + if (isGradProducer()) { + /*for (int i = 0; i < prev.size(); i++) { + if (prev[i]->isGradConsumer() && isGradProducer(prev[i]->getName())) { + prev[i]->getGradReducer().enqueueReduction(getDeviceID()); + } + }*/ + + // Send backward messages to *all* replicas. 
+ // Note that the messages will be dismissed unless the passIdx indicates + // that the previous layer should do some work. + for (int r = 0; r < getNumInputReplicas(); r++) { + for (int i = 0; i < _prev[r].size(); i++) { + if (_prev[r][i]->isGradConsumer() && isGradProducer(_prev[r][i]->getName())) { + _prev[r][i]->getConvNetThread().getMessageQueue().enqueue(new BpropMessage(*_prev[r][i], passType, passIdx)); + } + } + } + } +} + +IActGradReducer& Layer::getGradReducer() { + return *_gradReducer; +} + +// This is called between minibatches +void Layer::reset() { + _rcvdFInputMsgs = 0; + _rcvdBInputMsgs = 0; + for (map::iterator it = _numComputedActsGrads.begin(); it != _numComputedActsGrads.end(); ++it) { + it->second = 0; + } +} + +// This is called between microbatches +void Layer::resetPassIdx() { + _rcvdFInputMsgs = 0; + if (_rcvdBInputMsgs >= getNumExpectedBwdMsgs()) { + reset(); + } +} + +/* + * Returns number of cases in given matrix. + */ +int Layer::getNumCases(NVMatrix& v) { + return v.getLeadingDim(); +} + +int Layer::incRcvdBInputMsgs() { + return ++_rcvdBInputMsgs; +} + +std::string& Layer::getName() { + return _name; +} + +std::string& Layer::getType() { + return _type; +} + +int& Layer::getNumComputedActsGrads(int deviceID) { + return _numComputedActsGrads[deviceID]; +} + +void Layer::addNext(Layer& l) { + _next.push_back(&l); + _numReplicasNext = l.getNumReplicas(); + if (count(_nextDeviceIDs.begin(), _nextDeviceIDs.end(), l.getDeviceID()) == 0) { + int pos = rand() % (_nextDeviceIDs.size() + 1); + _nextDeviceIDs.insert(_nextDeviceIDs.begin() + pos, l.getDeviceID()); + } +} + +void Layer::addPrev(Layer& l, int replicaIdx) { + _prev[replicaIdx].push_back(&l); + _numReplicasPrev = l.getNumReplicas(); + l.setInputIdx(getName(), _prev[replicaIdx].size() - 1); + if (l.getDeviceID() >= 0 && count(_prevDeviceIDs.begin(), _prevDeviceIDs.end(), l.getDeviceID()) == 0) { + int pos = rand() % (_prevDeviceIDs.size() + 1); + _prevDeviceIDs.insert(_prevDeviceIDs.begin() + pos, l.getDeviceID()); + } +} + +void Layer::addReplica(Layer& l) { + assert(_replicas.count(l.getReplicaID()) == 0); + _replicas[l.getReplicaID()] = &l; +} + +bool Layer::hasGradProducerNext(std::string& layerName) { + bool b = _next.size() == 0; + for (int i = 0; i < _next.size(); i++) { + b |= _next[i]->hasGradProducerNext(_name); + } + return b && isGradProducer(layerName); +} + +bool Layer::postInit() { + // We choose not to populate _outputs[getDeviceID()] here because we do it instead in fprop(). + // In fprop(), we can populate it from the _inputs vector, which is a bit more general than populating + // it from _prev->getActs() +// _outputs = _actsTarget < 0 ? new NVMatrix() : &_prev[_actsTarget]->getActs(); + if (!_initialized) { + _initialized = true; + map numGradProducersNext; + _numGradProducersNext = 0; + for (int r = 0; r < getNumInputReplicas(); ++r) { + for (vector::const_iterator it = _prev[r].begin(); it != _prev[r].end(); ++it) { + (*it)->postInit(); + } + } + + _memSrcActs[getDeviceID()] = _actsTarget < 0 ? &MemorySource::make(_numOutputs, getDeviceID(), getName()) + : &_prev[0][_actsTarget]->getMemorySourceActs(getDeviceID()).clone(_name); + + // _actsGradTarget will only be >= 0 when the number of replicas is the same in both layers, so this justifies the use of _prev[0] + + _memSrcActsGrad[getDeviceID()] = _actsGradTarget < 0 ? 
&MemorySource::make(_numOutputs, getDeviceID(), getName()) + : &_prev[0][_actsGradTarget]->getMemorySourceActsGrad(getDeviceID()).clone(_name); + for (int i = 0; i < _next.size(); ++i) { + int d = _next[i]->getDeviceID(); + _numComputedActsGrads[d] = 0; + if (_next[i]->hasGradProducerNext(_name)) { + if (numGradProducersNext.count(d) == 0) { + numGradProducersNext[d] = 0; + } + numGradProducersNext[d]++; + _numGradProducersNext++; + if (_memSrcActsGrad.count(d) == 0) { + _memSrcActsGrad[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + if (_memSrcActs.count(d) == 0) { + _memSrcActs[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + + if (_next.size() == 0) { + _numReplicasNext = getNumReplicas(); + } + + /* + * Initialize forward broadcaster. First sibling owns it. + */ + if (getReplicaIdx() == 0 && _convNetThread != NULL) { + _actBroadcaster = new ActBroadcaster(getNumSiblingReplicas(), getDeviceCPUs(_convNetThread->getDeviceID())); + _actBroadcaster->start(); + } + + /* + * Initialize backward reducer. + */ + if (isGradConsumer() && _numGradProducersNext > 0) { + _gradReducer = &IActGradReducer::makeGradReducer(*this, numGradProducersNext); + _gradReducer->start(); + } + + /* + * Initialize specially sorted previous array + */ + for (int r = 0; r < _prev.size(); ++r) { + for (int i = 0; i < _prev[r].size(); ++i) { + // Previous devices in reverse order of processing by (sequential) GradReducer + _prevByDevice[r][getDeviceID() - _prev[r][i]->getDeviceID() + + 16 * (_prev[r][i]->getDeviceID() > getDeviceID())].insert(_prev[r][i]); + + } + } + return true; + } + return false; +} + +ActBroadcaster& Layer::getActBroadcaster() { + return getReplicaIdx() == 0 ? *_actBroadcaster : _replicas[getReplicaID() - getReplicaIdx()]->getActBroadcaster(); +} + +// Does this layer, or some layer below it, need the gradient +// for parameter updates? +// Only weight layers should be grad consumers themselves. +bool Layer::isGradConsumer() { + if (!_foundGradConsumers && _prev.size() > 0) { + for (int i = 0; i < _prev[0].size(); i++) { + _gradConsumer |= _prev[0][i]->isGradConsumer(); + } + _foundGradConsumers = true; + } + return _gradConsumer; +} + +// Does this layer produce gradient for layers below? 
+bool Layer::isGradProducer() { + return true; +} + +bool Layer::isGradProducer(std::string& layerName) { + return isGradProducer(); +} + +map >& Layer::getPrev() { + return _prev; +} + +vector& Layer::getNext() { + return _next; +} + +NVMatrix& Layer::getActs() { + return getActs(getDeviceID()); +} + +NVMatrix& Layer::getActs(int deviceID) { + assert(_memSrcActs.count(deviceID) > 0); + return _memSrcActs[deviceID]->getMemory(); +} + +NVMatrix& Layer::getActs(int deviceID, int numCases) { + assert(_memSrcActs.count(deviceID) > 0); + return _memSrcActs[deviceID]->getMemory(numCases); +} + +NVMatrix& Layer::getActsGrad(int deviceID) { + assert(_memSrcActsGrad.count(deviceID) > 0); + return _memSrcActsGrad[deviceID]->getMemory(getActs(deviceID).getLeadingDim()); +} + +NVMatrix& Layer::getActsGrad() { + return getActsGrad(NVMatrix::getDeviceID()); +} + +map Layer::getAllActs() { + map m; + for (map::const_iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + m[it->first] = &it->second->getMemory(); + } + return m; +} + +map Layer::getAllActsGrads() { + map m; + for (map::const_iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + m[it->first] = &it->second->getMemory(); + } + return m; +} + +int Layer::getDeviceID() { + return _convNetThread == NULL ? -1 : _convNetThread->getDeviceID(); +} + +ConvNetThread& Layer::getConvNetThread() { + assert(_convNetThread != NULL); + return *_convNetThread; +} + +ConvNet& Layer::getConvNet() { + return getConvNetThread().getConvNet(); +} + +void Layer::setBwdTerminal(int passIdx) { + _bwdTerminal[passIdx] = true; +} + +int Layer::getReplicaID() { + return _replicaID; +} + +int Layer::getActivePassPeriod() { + return getNumReplicas() / getConvNet().getNumReplicasMin(); +} + +int Layer::getFwdActiveInputReplicaIdx(int passIdx) { + const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas(); + return passIdx % getActivePassPeriod() == 0 ? edge : -1; +} + +int Layer::getBwdActiveInputReplicaIdx(int passIdx) { + const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas(); + return (passIdx + 1) % getActivePassPeriod() == 0 ? edge : -1; +} + +int Layer::getFwdActiveReplicaIdx(int passIdx) { + assert(_next.size() > 0); + return _next[0]->getFwdActiveInputReplicaIdx(passIdx); +} + +int Layer::getNumReplicas() { + return _replicas.size(); +} + +int Layer::getNumSiblingReplicas() { + return getNumReplicas() / getNumReplicasNext(); +} + +int Layer::getNumReplicasPrev() { + return _numReplicasPrev; +} + +int Layer::getNumReplicasNext() { + return _numReplicasNext; +} + +int Layer::getNumInputReplicas() { + return _numReplicasPrev / getNumReplicas(); +} + +int Layer::getReplicaIdx() { + return getReplicaID() % getNumSiblingReplicas(); +} + +int Layer::getNumLayersPrev() { + return _prev.size() > 0 ? 
_prev[0].size() : 0; +} + +void Layer::setMemorySourceActs(int deviceID, MemoryView& mem) { + assert(_memSrcActs[deviceID]->isParent()); + delete _memSrcActs[deviceID]; + _memSrcActs[deviceID] = &mem; + if (_actsTarget >= 0 && deviceID == getDeviceID()) { + assert(getNumInputReplicas() == 1); + _prev[0][_actsTarget]->setMemorySourceActs(deviceID, mem.clone(_prev[0][_actsTarget]->getName())); + } +} + +void Layer::setMemorySourceActsGrad(int deviceID, MemoryView& mem) { + assert(_memSrcActsGrad[deviceID]->isParent()); + delete _memSrcActsGrad[deviceID]; + _memSrcActsGrad[deviceID] = &mem; + if (_actsGradTarget >= 0 && deviceID == getDeviceID()) { + assert(getNumInputReplicas() == 1); + _prev[0][_actsGradTarget]->setMemorySourceActsGrad(deviceID, mem.clone(_prev[0][_actsGradTarget]->getName())); + } +} + +MemoryView& Layer::getMemorySourceActs(int deviceID) { + return *_memSrcActs[deviceID]; +} + +MemoryView& Layer::getMemorySourceActsGrad(int deviceID) { + return *_memSrcActsGrad[deviceID]; +} + +int Layer::getNumOutputs() { + return _numOutputs; +} + +void Layer::setInputIdx(std::string& parentName, int idx) { + _inputIndices[parentName] = idx; +} + +int Layer::getInputIdx(std::string& parentName) { + return _inputIndices[parentName]; +} + +/* + * ======================= + * NeuronLayer + * ======================= + */ +NeuronLayer::NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, true) { + PyObject* neuronDict = PyDict_GetItemString(paramsDict, "neuron"); + _neuronType = pyDictGetString(neuronDict, "type"); + _neuron = &Neuron::makeNeuron(neuronDict); +} + +NeuronLayer::~NeuronLayer() { + delete _neuron; +} + +void NeuronLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + if (!bpropSpecial(v, replicaIdx, inpIdx, scaleTargets, passType)) { + _neuron->computeInputGrad(v, _prev[replicaIdx][0]->getActsGrad(), scaleTargets > 0); + } +} + +bool NeuronLayer::bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + // Special optimization for cross-entropy objective with logistic units. + // Better to just compute the input gradient in one go to avoid division by small numbers. 
+ bool doCrossEntGrad = _neuronType == "logistic" && _next.size() == 1 + && (_next[0]->getType() == "cost.bce" || _next[0]->getType() == "cost.dce") + && _next[0]->getDeviceID() == getDeviceID() + && _next[0]->getNumReplicas() == getNumReplicas(); + LayerV& prev = _prev[replicaIdx]; + if (doCrossEntGrad) { + NVMatrix& labels = _next[0]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); + BinomialCrossEntropyCostLayer& cost = *static_cast(_next[0]); + float gradCoeff = cost.getCoeff(); + labels.transpose(_trans); + if (cost.getPosWeight() == 1) { + if (scaleTargets == 0) { + getActs().add(labels, -gradCoeff, gradCoeff, prev[0]->getActsGrad()); + } else { + getActs().applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::WeightedAdd(-gradCoeff, gradCoeff)), + labels, prev[0]->getActsGrad(), prev[0]->getActsGrad()); + } + } else { + if (scaleTargets == 0) { + getActs().applyBinary(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight()), labels, prev[0]->getActsGrad()); + } else { + getActs().applyTernary(AddGradientBinaryOperator(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight())), + labels, prev[0]->getActsGrad(), prev[0]->getActsGrad()); + } + } + } + return doCrossEntGrad; +} + +void NeuronLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + _neuron->activate(*_inputs[0], getActs()); +} + +std::string& NeuronLayer::getNeuronType() { + return _neuronType; +} + +/* + * ======================= + * WeightLayer + * ======================= + * + * The useGrad parameter here merely expresses a preference by the subclass. It may + * be overridden by the superclass (WeightLayer) and in that case the subclass must follow its wishes. + * So when computing gradient updates, the subclass must always first check weights.isUseGrad(). + * + * Note: biases always useGrad. + */ +WeightLayer::WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad) : + Layer(convNetThread, paramsDict, replicaID, trans) { + _weightUpdatePassPeriod = pyDictGetInt(paramsDict, "updatePeriod"); + + MatrixV& hWeights = *pyDictGetMatrixV(paramsDict, "weights"); + MatrixV& hWeightsInc = *pyDictGetMatrixV(paramsDict, "weightsInc"); + Matrix& hBiases = *pyDictGetMatrix(paramsDict, "biases"); + Matrix& hBiasesInc = *pyDictGetMatrix(paramsDict, "biasesInc"); + PyObject* pyEpsWList = PyDict_GetItemString(paramsDict, "epsW"); + PyObject* pyEpsB = PyDict_GetItemString(paramsDict, "epsB"); + floatv& momW = *pyDictGetFloatV(paramsDict, "momW"); + float momB = pyDictGetFloat(paramsDict, "momB"); + floatv& wc = *pyDictGetFloatV(paramsDict, "wc"); + floatv& wball = *pyDictGetFloatV(paramsDict, "wballNormed"); + + /* + * When there are multiple replicas, the present implementation + * requires that useGrad is true. This is because weights.update() + * performs a simultaneous write to both replicas' weightsInc matrix, + * which means that the read should come from somewhere else (i.e. a + * grads matrix). 
+ */ + useGrad |= _numReplicas > 1; + + // Source layers for shared weights + stringv& weightSourceLayers = *pyDictGetStringV(paramsDict, "weightSourceLayers"); + + // Weight matrix indices (inside the above source layers) for shared weights + intv& weightSourceMatrixIndices = *pyDictGetIntV(paramsDict, "weightSourceMatrixIndices"); + _weights = new WeightList(); + for (int i = 0; i < weightSourceLayers.size(); i++) { + std::string& srcLayerName = weightSourceLayers[i]; + int matrixIdx = weightSourceMatrixIndices[i]; + PyObject* pyEpsW = PyList_GetItem(pyEpsWList, i); + ParameterSchedule& lrs = ParameterSchedule::make(pyEpsW); // Learning rate schedule + if (srcLayerName == _name) { // Current layer + _weights->addWeights(*new Weights(_weights->at(matrixIdx), lrs, *this)); + } else if (srcLayerName != "") { + WeightLayer& srcLayer = *static_cast(&convNetThread->getLayer(srcLayerName)); + Weights* srcWeights = &srcLayer.getWeights(matrixIdx); + _weights->addWeights(*new Weights(*srcWeights, lrs, *this)); + } else { + _weights->addWeights(*new Weights(*hWeights[i], *hWeightsInc[i], lrs, *this, wc[i], wball[i], momW[i], useGrad)); + } + } + _biases = new Weights(hBiases, hBiasesInc, ParameterSchedule::make(pyEpsB), *this, 0, 0, momB, true); + + delete &weightSourceLayers; + delete &weightSourceMatrixIndices; + delete &hWeights; + delete &hWeightsInc; + delete &momW; + delete &wc; + delete &wball; + + _wStep = 0.02; + _bStep = 0.005; +} + +WeightLayer::~WeightLayer() { + delete _weights; + delete _biases; +} + +bool WeightLayer::postInit() { + if (Layer::postInit()) { + _weightUpdatePassPeriod = max(_weightUpdatePassPeriod, getActivePassPeriod()); + assert(_weightUpdatePassPeriod % getActivePassPeriod() == 0); + return true; + } + return false; +} + +void WeightLayer::fpropCommon(PASS_TYPE passType) { +} + +void WeightLayer::bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) { + if (_biases->getLearningRateSchedule().getBaseValue() > 0) { + if (v.getNumElements() > 0) { + bpropBiases(v, passType); + } else { + _biases->getGrad().resize(_biases->getW()); + _biases->getGrad().scale(getBIncScale()); + } + _biases->incNumUpdates(); + } + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + if (v.getNumElements() > 0) { + bpropWeights(v, replicaIdx, i, passType); + } else { + _weights->at(i).getGrad().resize(_weights->at(i).getW()); + // This will cause it to forget momentum when shown 0 training cases + // and _useGrad = false but it's not too important. 
+ _weights->at(i).getGrad().scale(getIncScale(i, passType)); + } + // Increment its number of updates + _weights->at(i).incNumUpdates(); + } + } +} + +bool WeightLayer::updateWeights() { + if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) { + _weights->update(getConvNet().getTrainingProgress()); + _biases->update(getConvNet().getTrainingProgress()); +// constrainWeights(); + return true; + } + return false; +} + +bool WeightLayer::constrainWeights() { + if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) { + _constrainWeights(); + return true; + } + return false; +} + +void WeightLayer::_constrainWeights() { +} + +void WeightLayer::copyToCPU() { + _weights->copyToCPU(); + _biases->copyToCPU(); +} + +void WeightLayer::copyToGPU() { + _weights->copyToGPU(); + _biases->copyToGPU(); +} + +void WeightLayer::checkGradient() { + for (int i = 0; i < _weights->getSize(); i++) { + getConvNet().checkGradient(_name + " weights[" + tostr(i) + "]", _wStep, _weights->at(i)); + } + getConvNet().checkGradient(_name + " biases", _bStep, *_biases); +} + +void WeightLayer::addReplica(Layer& l) { + Layer::addReplica(l); + _weights->addReplica(*static_cast(&l)->_weights); + _biases->addReplica(*static_cast(&l)->_biases); +} + +Weights& WeightLayer::getWeights(int idx) { + return _weights->at(idx); +} + +float WeightLayer::getGradScale(int inpIdx, PASS_TYPE passType) { + // weight update period must be multiple of activation period + // TODO: simply accumulate # of cases seen between weight updates. simpler and more accurate. + double numCases = _weightUpdatePassPeriod * (getConvNet().getMinibatchSize() / double(getConvNet().getNumPasses())); + if (_weights->at(inpIdx).isUseGrad()) { + return passType == PASS_GC ? 1.0f : 1.0f / numCases; + } + return passType == PASS_GC ? 1.0f : _weights->at(inpIdx).getEps(getConvNet().getTrainingProgress()) / numCases; +} + +float WeightLayer::getIncScale(int inpIdx, PASS_TYPE passType) { + if (_weights->at(inpIdx).isUseGrad()) { + return _weights->at(inpIdx).getNumUpdates() > 0; + } + return (passType == PASS_GC ? _weights->at(inpIdx).getNumUpdates() > 0 + : (_weights->at(inpIdx).getNumUpdates() == 0 ? _weights->at(inpIdx).getMom() : 1.0f)); +} + +NVMatrix& WeightLayer::getGradTarget(int inpIdx) { + return _weights->at(inpIdx).getGrad(); +} + +float WeightLayer::getBGradScale(PASS_TYPE passType) { + int numCases = _weightUpdatePassPeriod * DIVUP(getConvNet().getMinibatchSize(), getConvNet().getNumPasses()); + return passType == PASS_GC ? 
1.0f : 1.0f / numCases; +} + +float WeightLayer::getBIncScale() { + return _biases->getNumUpdates() > 0; +} + +NVMatrix& WeightLayer::getWeightMatrix(PASS_TYPE passType, int inpIdx) { + return _weights->at(inpIdx).getW(); +} + +NVMatrix& WeightLayer::getBiasMatrix(PASS_TYPE passType) { + return _biases->getW(); +} + +/* + * ======================= + * FCLayer + * ======================= + */ +FCLayer::FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : WeightLayer(convNetThread, paramsDict, replicaID, true, useGrad) { + _wStep = 0.01; + _bStep = 0.01; +} + +void FCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().addProduct(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType), 1, getActs()); + } +} + +void FCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose(); + _prev[replicaIdx][inpIdx]->getActsGrad().addProduct(v, weights_T, scaleTargets, 1); + delete &weights_T; +} + +void FCLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + _biases->getGrad().addSum(v, 0, getBIncScale(), getBGradScale(passType)); +} + +void FCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose(); + float scaleGrad = getGradScale(inpIdx, passType); + float scaleInc = getIncScale(inpIdx, passType); + getGradTarget(inpIdx).addProduct(prevActs_T, v, scaleInc, scaleGrad); + delete &prevActs_T; +} + +void FCLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { +// NVMatrix norm2; // Unfortunate extra weight matrix... 
+ _weights->at(i).getW().sumOfSquares(0, _norm2); +// norm2.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall())); + _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall())); + _weights->at(i).getW().eltwiseMultByVector(_norm2); + } + } +} + +/* + * ======================= + * SplitFCLayer + * ======================= + */ +SplitFCLayer::SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : FCLayer(convNetThread, paramsDict, replicaID, useGrad) { + _numParts = pyDictGetInt(paramsDict, "parts"); +} + +void SplitFCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().resize(_inputs[inpIdx]->getNumRows(), _numOutputs, true); + NVMatrixV& splitInput = _inputs[inpIdx]->splitCols(_numParts); + NVMatrixV& splitWeights = getWeightMatrix(passType, inpIdx).splitRows(_numParts); + NVMatrixV& splitTarget = getActs().splitCols(_numParts); + + NVMatrix::batchedMatrixMultiply(splitInput, splitWeights, splitTarget, scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType), 1, getActs()); + } + + deleteElements(splitInput, true); + deleteElements(splitWeights, true); + deleteElements(splitTarget, true); +} + +void SplitFCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose(); + _prev[replicaIdx][inpIdx]->getActsGrad().resize(*_inputs[inpIdx]); + + NVMatrixV& splitV = v.splitCols(_numParts); + NVMatrixV& splitWeights_T = weights_T.splitCols(_numParts); + NVMatrixV& splitTarget = _prev[replicaIdx][inpIdx]->getActsGrad().splitCols(_numParts); + + NVMatrix::batchedMatrixMultiply(splitV, splitWeights_T, splitTarget, scaleTargets, 1); + + delete &weights_T; + deleteElements(splitV, true); + deleteElements(splitWeights_T, true); + deleteElements(splitTarget, true); +} + +void SplitFCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose(); + NVMatrixV& splitPrevActs_T = prevActs_T.splitRows(_numParts); + NVMatrixV& splitV = v.splitCols(_numParts); + NVMatrixV& splitGradTarget = getGradTarget(inpIdx).splitRows(_numParts); + + NVMatrix::batchedMatrixMultiply(splitPrevActs_T, splitV, splitGradTarget, getIncScale(inpIdx, passType), getGradScale(inpIdx, passType)); + + delete &prevActs_T; + deleteElements(splitPrevActs_T, true); + deleteElements(splitV, true); + deleteElements(splitGradTarget, true); +} + +/* + * ======================= + * TwoDLayerInterface + * ======================= + */ +TwoDLayerInterface::TwoDLayerInterface(PyObject* paramsDict) { + _channels = pyDictGetInt(paramsDict, "channels"); + _imgSize = pyDictGetInt(paramsDict, "imgSize"); + _imgPixels = _imgSize * _imgSize; +} + +/* + * ======================= + * LocalLayer + * ======================= + */ +LocalLayer::LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : WeightLayer(convNetThread, paramsDict, replicaID, false, useGrad) { + _padding = pyDictGetIntV(paramsDict, "padding"); + _stride = pyDictGetIntV(paramsDict, "stride"); + _filterSize = pyDictGetIntV(paramsDict, "filterSize"); + _channels = pyDictGetIntV(paramsDict, "channels"); + _imgSize = pyDictGetIntV(paramsDict, "imgSize"); + _numFilters = pyDictGetInt(paramsDict, "filters"); + _groups = pyDictGetIntV(paramsDict, "groups"); + _filterChannels = pyDictGetIntV(paramsDict, "filterChannels"); + _filterPixels 
= pyDictGetIntV(paramsDict, "filterPixels"); + _imgPixels = pyDictGetIntV(paramsDict, "imgPixels"); + + _modulesX = pyDictGetInt(paramsDict, "modulesX"); + _modules = pyDictGetInt(paramsDict, "modules"); +} + +LocalLayer::~LocalLayer() { + delete _padding; + delete _stride; + delete _filterSize; + delete _channels; + delete _imgSize; + delete _groups; + delete _filterChannels; + delete _filterPixels; + delete _imgPixels; +} + +/* + * ======================= + * ConvLayer + * ======================= + */ +ConvLayer::ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : LocalLayer(convNetThread, paramsDict, replicaID, true) { + _sumWidth = pyDictGetInt(paramsDict, "sumWidth"); + _sharedBiases = pyDictGetInt(paramsDict, "sharedBiases"); + _weightContrastNormMin = pyDictGetFloatV(paramsDict, "wcNormMin"); + _weightContrastNormMax = pyDictGetFloatV(paramsDict, "wcNormMax"); +} + +ConvLayer::~ConvLayer() { + delete _weightContrastNormMin; + delete _weightContrastNormMax; +} + +void ConvLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); + + if (scaleTargets == 0) { + if (_sharedBiases) { + getActs().reshape(_numFilters, getActs().getNumElements() / _numFilters); + getActs().addVector(getBiasMatrix(passType)); + getActs().reshape(_numFilters * _modules, getActs().getNumElements() / (_numFilters * _modules)); + } else { + getActs().addVector(getBiasMatrix(passType)); + } + } +} + +void ConvLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + float scaleBGrad = getBGradScale(passType); + float scaleInc = getBIncScale(); + if (_sharedBiases) { + v.reshape(_numFilters, v.getNumElements() / _numFilters); + _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); + v.reshape(_numFilters * _modules, v.getNumElements() / (_numFilters * _modules)); + } else { + _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); + } +} + +void ConvLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + assert(_weights->at(inpIdx).isUseGrad()); + bool doPartialSum = _sumWidth < _modulesX; + NVMatrix& tgt = doPartialSum ? 
_weightGradTmp : _weights->at(inpIdx).getGrad(); + + float scaleWGrad = getGradScale(inpIdx, passType); + float scaleTargets = getIncScale(inpIdx, passType) * !doPartialSum; + + convWeightActs(*_inputs[inpIdx], v, tgt, _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), _sumWidth, scaleTargets, scaleWGrad); + + if (doPartialSum) { + scaleTargets = _weights->at(inpIdx).getNumUpdates() > 0; + int outWidth = DIVUP(_modulesX, _sumWidth); + _weightGradTmp.reshape(outWidth*outWidth, _filterChannels->at(inpIdx) * _filterPixels->at(inpIdx) * _numFilters); + _weights->at(inpIdx).getGrad().addSum(_weightGradTmp, 0, scaleTargets, 1); + _weights->at(inpIdx).getGrad().reshape(_filterChannels->at(inpIdx) * _filterPixels->at(inpIdx), _numFilters); + } +} + +void ConvLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(), _imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX, + _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); +} + +void ConvLayer::truncBwdActs() { + LocalLayer::truncBwdActs(); + _weightGradTmp.truncate(); +} + +void ConvLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weightContrastNormMax->at(i) > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + float fz = _weights->at(i).getW().getNumRows(); + NVMatrix tmp; + _weights->at(i).getW().sum(0, tmp); + _weights->at(i).getW().addVector(tmp, -1.0f / fz, _weights->at(i).getGrad()); + // Now _weights->at(i).getGrad() contains zero-mean filters + _weights->at(i).getGrad().apply(NVMatrixOps::Square()); + _weights->at(i).getGrad().sum(0, tmp); + + tmp.apply(WeightContrastNormOperator(_weightContrastNormMin->at(i), _weightContrastNormMax->at(i), 1.0f / fz)); + // Now tmp has the stdev + _weights->at(i).getW().eltwiseMultByVector(tmp); + } + // It's pretty silly to do both these things but whatever + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { +// NVMatrix norm2; + _weights->at(i).getW().sumOfSquares(0, _norm2); + +// norm.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall())); + _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall())); + _weights->at(i).getW().eltwiseMultByVector(_norm2); + } + } +} + +/* + * ======================= + * LocalUnsharedLayer + * ======================= + */ +LocalUnsharedLayer::LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : LocalLayer(convNetThread, paramsDict, replicaID, false) { +} + +void LocalUnsharedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + localFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType)); + } +} + +void LocalUnsharedLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + _biases->getGrad().addSum(v, 1, getBIncScale(), getBGradScale(passType)); +} + +void LocalUnsharedLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + float scaleWGrad = getGradScale(inpIdx, 
passType); + float scaleInc = getIncScale(inpIdx, passType); + localWeightActs(*_inputs[inpIdx], v, getGradTarget(inpIdx), _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleInc, scaleWGrad); +} + +void LocalUnsharedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + localImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(),_imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX, + _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); +} + +void LocalUnsharedLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + normalizeLocalWeights(*_weights->at(i), _modules, _weights->at(i).getWBall()); + } + } +} + +/* + * ======================= + * SoftmaxLayer + * ======================= + */ +SoftmaxLayer::SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, true), _doUpperGrad(false) { +} + +void SoftmaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + NVMatrix& input = *_inputs[0]; + input.max(1, _max); + input.addVector(_max, -1, getActs()); + getActs().apply(NVMatrixOps::Exp()); + getActs().sum(1, _sum); + getActs().eltwiseDivideByVector(_sum); +} + +void SoftmaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + LayerV& prev = _prev[replicaIdx]; + if (_doUpperGrad) { + // Todo: rethink replica IDs or idxes... 
this here doesn't make a huge amount of sense + for (int i = 0; i < _next.size(); ++i) { + if (_next[i]->isGradProducer(getName())) { + NVMatrix& labels = _next[i]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); // Get cost's labels + float gradCoeff = dynamic_cast(_next[i])->getCoeff(); + + computeLogregSoftmaxGrad(labels, getActs(), prev[0]->getActsGrad(), scaleTargets == 1, gradCoeff); + break; + } + } + + } else { + computeSoftmaxGrad(getActs(), v, prev[0]->getActsGrad(), scaleTargets, 1); + } +} + +void SoftmaxLayer::setDoUpperGrad(bool b) { + _doUpperGrad = b; +} + +/* + * ======================= + * ConcatenationLayer + * ======================= + */ +ConcatenationLayer::ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, false) { + _copyOffsets = pyDictGetIntV(paramsDict, "copyOffsets"); + _copyOffsets->push_back(_numOutputs); +} + +ConcatenationLayer::~ConcatenationLayer() { + delete _copyOffsets; +} + +void ConcatenationLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().resize(_numOutputs, _inputs[inpIdx]->getNumCols()); + _inputs[inpIdx]->copy(getActs(), 0, -1, 0, -1, _copyOffsets->at(inpIdx), 0); +} + +void ConcatenationLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& copySrc = v.sliceRows(_copyOffsets->at(inpIdx), _copyOffsets->at(inpIdx + 1)); // view + _prev[replicaIdx][inpIdx]->getActsGrad().add(copySrc, scaleTargets, 1); + delete ©Src; +} + +/* + * ======================= + * PassThroughLayer + * ======================= + */ +PassThroughLayer::PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void PassThroughLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // No-op +} + +void PassThroughLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + // No-op +} + +bool PassThroughLayer::postInit() { + if (Layer::postInit()) { + assert(getNumInputReplicas() == 1); + for (int i = 0, offset = 0; i < _prev[0].size(); offset += _prev[0][i]->getNumOutputs(), i++) { + MemoryView& vActs = _memSrcActs[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair(offset, offset + _prev[0][i]->getNumOutputs())); + MemoryView& vActsGrad = _memSrcActsGrad[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair(offset, offset + _prev[0][i]->getNumOutputs())); + _prev[0][i]->setMemorySourceActs(getDeviceID(), vActs); + _prev[0][i]->setMemorySourceActsGrad(getDeviceID(), vActsGrad); + } + return true; + } + return false; +} + + +/* + * ======================= + * EltwiseSumLayer + * ======================= + */ +EltwiseSumLayer::EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _coeffs = pyDictGetFloatV(paramsDict, "coeffs"); +} + +EltwiseSumLayer::~EltwiseSumLayer() { + delete _coeffs; +} + +void EltwiseSumLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().add(*_inputs[inpIdx], scaleTargets, _coeffs->at(inpIdx)); +} + +void EltwiseSumLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + _prev[replicaIdx][inpIdx]->getActsGrad().add(v, scaleTargets, _coeffs->at(inpIdx)); +} + +/* + * ======================= + * EltwiseMaxLayer + 
* ======================= + */ +EltwiseMaxLayer::EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void EltwiseMaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (inpIdx == 1) { // First input, do nothing + _inputs[inpIdx]->applyBinary(NVMatrixAggs::Max(), *_inputs[0], getActs()); + } else if (inpIdx > 1) { + getActs().applyBinary(NVMatrixAggs::Max(), *_inputs[inpIdx]); + } +} + +void EltwiseMaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + computeEltwiseMaxGrad(v, *_inputs[inpIdx], getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), scaleTargets != 0); +} + + +/* + * ======================= + * DropoutLayer + * ======================= + * + * TODO: optimize away the case when using dopout over relus. Don't need the keepmask. + */ +DropoutLayer::DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _enable = pyDictGetInt(paramsDict, "enable"); + _keep = pyDictGetFloat(paramsDict, "keep"); +} + +void DropoutLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_enable && passType == PASS_TRAIN) { + _keepMask.resize(*_inputs[inpIdx]); + _keepMask.randomizeUniform(); + _keepMask.apply(DropoutSmallerThanOperator(_keep)); + _inputs[inpIdx]->eltwiseMult(_keepMask, getActs()); + } else { + _inputs[inpIdx]->copy(getActs()); + } +} + +void DropoutLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + LayerV& prev = _prev[replicaIdx]; + if (_enable && passType == PASS_TRAIN) { + if (scaleTargets != 0) { + v.applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::Multiply()), + _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad()); + } + } else { + prev[inpIdx]->getActsGrad().add(v, scaleTargets, 1); + } +} + +void DropoutLayer::truncBwdActs() { + Layer::truncBwdActs(); + _keepMask.truncate(); +} + + +/* + * ======================= + * Dropout2Layer + * ======================= + * + * TODO: optimize away the case when using dopout over relus. Don't need the keepmask. 
+ */ +Dropout2Layer::Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : DropoutLayer(convNetThread, paramsDict, replicaID) { +} + +void Dropout2Layer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_enable && passType == PASS_TRAIN) { + _keepMask.resize(*_inputs[inpIdx]); + _keepMask.randomizeUniform(); + _keepMask.smallerThanScalar(_keep); + _inputs[inpIdx]->eltwiseMult(_keepMask, getActs()); + } else { + _inputs[inpIdx]->scale(_keep, getActs()); + } +} + +void Dropout2Layer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + LayerV& prev = _prev[replicaIdx]; + if (_enable && passType == PASS_TRAIN) { + if (scaleTargets != 0) { + v.applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::Multiply()), + _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad()); + } + } else { + if (scaleTargets != 0) { + v.applyBinary(AddGradientOperator(NVMatrixOps::MultByScalar(_keep)), + prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.scale(_keep, prev[inpIdx]->getActsGrad()); + } + } +} + +/* + * ======================= + * DataLayer + * ======================= + */ +DataLayer::DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID) : Layer(NULL, paramsDict, replicaID, false) { + _dataIdx = pyDictGetInt(paramsDict, "dataIdx"); + _start = pyDictGetInt(paramsDict, "start"); + _end = pyDictGetInt(paramsDict, "end"); + _useBuffer = false; + _outstandingCopyRequest = false; + _convNet = convNet; +} + +DataLayer::~DataLayer() { + for (map::const_iterator it = _copyStreams.begin(); it != _copyStreams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(it->second)); + } + for (std::map::iterator it = _memSrcActs2.begin(); it != _memSrcActs2.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } + _copier->stop(); + delete _copier; +} + +void DataLayer::fprop(PASS_TYPE passType, int passIdx, bool fromBuffer) { + waitForCopyFinish(); + if (fromBuffer && getFwdActiveInputReplicaIdx(passIdx) >= 0) { + _useBuffer = !_useBuffer; + } + + for (int i = 0; i < _next.size(); i++) { + _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx)); + } +} + +void DataLayer::waitForCopyFinish() { + if (_outstandingCopyRequest) { + _copyFinishQueue.dequeue(); + assert(_copyFinishQueue.getNumElements() == 0); + _outstandingCopyRequest = false; + } +} + +cudaStream_t DataLayer::getCopyStream(int deviceID) { + if (_copyStreams.count(deviceID) == 0) { + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_copyStreams[deviceID], cudaStreamNonBlocking)); + } + return _copyStreams[deviceID]; +} + +void DataLayer::copyData(CPUData& data, bool other, int passIdx) { + assert(!_outstandingCopyRequest); + assert(_copyFinishQueue.getNumElements() == 0); + _copier->getQueue().enqueue(new DataCopyMessage(data, other, passIdx)); + _outstandingCopyRequest = true; +} + +int DataLayer::getNumInputReplicas() { + return _convNet->getNumReplicasMax() / getNumReplicas(); +} + +void DataLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + +} + +NVMatrix& DataLayer::getActs(int deviceID) { + return getActs(deviceID, false, -1); +} + +NVMatrix& DataLayer::getActs(int deviceID, bool other, int numCases) { +// printf("%s[%d] getActs(%d, %d, %d)\n", _name.c_str(), 
getReplicaID(), deviceID, other, numCases); + assert(_memSrcActs.count(deviceID) > 0); + assert(_memSrcActs2.count(deviceID) > 0); + return (_useBuffer != other ? _memSrcActs2[deviceID]->getMemory(numCases) : _memSrcActs[deviceID]->getMemory(numCases)); +} + +ConvNet& DataLayer::getConvNet() { + return *_convNet; +} + +bool DataLayer::postInit() { + if (Layer::postInit()) { + for (int i = 0; i < _next.size(); ++i) { + int d = _next[i]->getDeviceID(); + if (_memSrcActs2.count(d) == 0) { + _memSrcActs2[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + intv cpus = getDeviceCPUs(_next[0]->getDeviceID()); + _copier = new DataCopyThread(*this, cpus); + _copier->start(); + return true; + } + return false; +} + +bool DataLayer::isGradProducer() { + return false; +} + +/* + * ======================= + * DataCopyThread + * ======================= + */ +DataCopyThread::DataCopyThread(DataLayer& parent, intv& cpus) : _parent(&parent), _sleepUsec(0), Thread(true, cpus) { +} + +Queue& DataCopyThread::getQueue() { + return _queue; +} + +void DataCopyThread::stop() { + getQueue().enqueue(new DataCopyExitMessage()); + join(); +} + +void* DataCopyThread::run() { + NVMatrix::setDeviceID(*_parent->getNextDeviceIDs().begin()); + bool exit = false; + while(!exit) { + DataCopyMessage& msg = *_queue.dequeue(); + exit = msg.getType() == DataCopyMessage::EXIT; + if (!exit) { + CPUData& data = msg.getData(); + int passIdx = msg.getPassIdx(); + bool other = msg.isOther(); + + Matrix& dataMatrix = data.getData(_parent->getDataIdx()); + // How many times is this layer going to process microbatches from this minibatch? + assert(_parent->getNumReplicasNext() == _parent->getNumReplicas()); + int microIdx = _parent->getFwdActiveInputReplicaIdx(passIdx); + + if (microIdx >= 0) { + if (_requestTimer.isStarted()) { + double requestIntervalMsec = _requestTimer.stop(); + // Sleep for up to 1/20th the average request interval + _sleepUsec = int(round(0.95 * _sleepUsec + 0.05 * (_parent->getReplicaID() / double(_parent->getNumReplicas())) * requestIntervalMsec * 1000.0 / 20.0)); + } + _requestTimer.start(); + if (other) { + // Sleeping a bit is helpful because in typical nets, copying input data + // as soon as it's available will produce contention with other communications + // that are happening at the time. This is very much a hack, so in the future + // it might be good to replace it with something smarter which schedules access + // to communication links. + usleep(_sleepUsec); + } + microIdx += _parent->getReplicaID() * _parent->getNumInputReplicas(); + // Safer to divup because this way you won't get a minibatch size of 0 + int microbatchSize = DIVUP(data.getNumCases(), _parent->getConvNet().getNumReplicasMax()); + int microStart = microIdx * microbatchSize; + int microEnd = min(data.getNumCases(), (microIdx + 1) * microbatchSize); + // Check that this replica has some data. This can be false when, for example, + // there are only 7 examples in the minibatch but 8 replicas. + if (microStart < microEnd) { + assert(dataMatrix.isView() == dataMatrix.isTrans()); + int pipe = _parent->getConvNet().getDataCopyPD().getPipe(_parent->getReplicaID()/2); + if (dataMatrix.isTrans()) { + Matrix& replicaDataMatrix = dataMatrix.sliceCols(microStart, microEnd); + // In this case, dataMatrix is a view on memory allocated by Python. 
+ //_hostMemFwd.copyFromHost(replicaDataMatrix, true); + _hostMemFwd.resize(replicaDataMatrix.getNumRows(), replicaDataMatrix.getNumCols(), true); + memcpy(_hostMemFwd.getDevData(), replicaDataMatrix.getData(), replicaDataMatrix.getNumDataBytes()); + delete &replicaDataMatrix; // view + NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd()); + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + // Copy my output to this guy's GPU + NVMatrix::setDeviceID(deviceID); + // Note to self: this is the path that gets executed in practice + // in my models. It does a transpose & copy simultaneously. + hostMemFwdSlice.flipTrans(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID)); + } + delete &hostMemFwdSlice; + } else { + // Hacky way to copy a slice to _hostMemFwd + _hostMemFwd.resize(dataMatrix.getNumRows(), microEnd - microStart); + Matrix tmp(_hostMemFwd.getDevData(), _hostMemFwd.getNumRows(), _hostMemFwd.getNumCols(), _hostMemFwd.isTrans()); + dataMatrix.sliceCols(microStart, microEnd, tmp); + NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd()); + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + // Copy my output to this guy's GPU + NVMatrix::setDeviceID(deviceID); + hostMemFwdSlice.copy(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID)); + } + delete &hostMemFwdSlice; + } + + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + NVMatrix::setDeviceID(deviceID); + NVMatrix::syncStream(_parent->getCopyStream(deviceID)); + } + _parent->getConvNet().getDataCopyPD().freePipe(pipe); + } else { + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + _parent->getActs(deviceID, other, 0); + } + } + } + _parent->getCopyFinishQueue().enqueue(1); + } + delete &msg; + } + return NULL; +} + +/* + * ===================== + * PoolLayer + * ===================== + */ +PoolLayer::PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) { + _sizeX = pyDictGetInt(paramsDict, "sizeX"); + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputsX = pyDictGetInt(paramsDict, "outputsX"); + _pool = pyDictGetString(paramsDict, "pool"); +} + +PoolLayer& PoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) { + std::string _pool = pyDictGetString(paramsDict, "pool"); + if (_pool == "max") { + return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, false); + } else if(_pool == "maxabs") { + return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, true); + } else if(_pool == "avg") { + return *new AvgPoolLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown pooling layer type ") + _pool; +} + +/* + * ===================== + * AvgPoolLayer + * ===================== + */ +AvgPoolLayer::AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : PoolLayer(convNetThread, paramsDict, replicaID, false) { + _sum = pyDictGetInt(paramsDict, "sum"); +} + +void AvgPoolLayer::fpropActs(int inpIdx, float 
scaleTargets, PASS_TYPE passType, int passIdx) { + if (_sum) { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); + } else { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); + } +} + +void AvgPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convLocalAvgUndo(v, _prev[replicaIdx][0]->getActsGrad(), _sizeX, _start, _stride, _outputsX, _imgSize, _sum, scaleTargets, 1); +} + +/* + * ===================== + * MaxPoolLayer + * ===================== + */ +MaxPoolLayer::MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs) : PoolLayer(convNetThread, paramsDict, replicaID, false), _abs(abs) { +} + +void MaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_abs) { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxAbsPooler()); + } else { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxPooler()); + } +} + +void MaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + convLocalMaxUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _sizeX, _start, _stride, _outputsX, scaleTargets, 1); +} + +/* + * ===================== + * CrossMapPoolLayer + * ===================== + */ +CrossMapPoolLayer::CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) { + _size = pyDictGetInt(paramsDict, "size"); + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputs = pyDictGetInt(paramsDict, "outputChannels"); + _pool = pyDictGetString(paramsDict, "pool"); +} + +CrossMapPoolLayer& CrossMapPoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) { + std::string _pool = pyDictGetString(paramsDict, "pool"); + if (_pool == "max") { + return *new CrossMapMaxPoolLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown pooling layer type ") + _pool; +} + +/* + * ===================== + * CrossMapMaxPoolLayer + * ===================== + */ +CrossMapMaxPoolLayer::CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CrossMapPoolLayer(convNetThread, paramsDict, replicaID, false) { +} + +void CrossMapMaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convPoolCrossMap(*_inputs[0], getActs(), _start, _size, _outputs, _stride, _imgSize, MaxPooler()); +} + +void CrossMapMaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + convCrossMapMaxPoolUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][0]->getActsGrad(), _imgSize, _start, _size, _stride, scaleTargets, 1); +} + +/* + * ===================== + * RandomScaleLayer + * ===================== + */ +RandomScaleLayer::RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _maxScale = pyDictGetFloat(paramsDict, "maxScale"); + _tgtSize = pyDictGetInt(paramsDict, "tgtSize"); + // The smallest size the image could be after rescaling + _minScaledSize = _imgSize / _maxScale; + + // The number of 
discrete scales we're considering + int numScales = _imgSize - _minScaledSize + 1; + + // The total number of squares of size _tgtSize that we can extract + // from all these scales + double numCrops = numScales * (numScales + 1) * (2 * numScales + 1) / 6; + + // For each scale, record the fraction of the squares that it has. + // This will be the probability of sampling this scale. + _scaleProbs.push_back(1.0 / numCrops); + for (int s = 1; s < numScales; ++s) { + _scaleProbs.push_back(_scaleProbs[s-1] + (s + 1) * (s + 1) / numCrops); + } +} + +void RandomScaleLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (IS_TRAIN(passType)) { + // _maxScale is in the range [1, 2) + float r = randf; + int rescaledSize = _tgtSize; + float scaleFactor = _maxScale; + // Find which scale we have sampled + for (int s = 0; s < _scaleProbs.size(); ++s) { + if (r <= _scaleProbs[s]) { + rescaledSize += s; + float scaleFactorEnd = _imgSize / float(rescaledSize); + float scaleFactorStart = max(1.0, _imgSize / (1.0 + rescaledSize)); + scaleFactor = scaleFactorStart + randf * (scaleFactorEnd - scaleFactorStart); + break; + } + } + assert(rescaledSize >= _tgtSize); + int maxStart = rescaledSize - _tgtSize; + int startY = rand() % (1 + maxStart), startX = rand() % (1 + maxStart); + if (rescaledSize == _imgSize) { + convCrop(*_inputs[0], getActs(), rescaledSize, _tgtSize, startY, startX); + } else { + convResizeBilinear(*_inputs[0], _rescaledActs, _imgSize, rescaledSize, scaleFactor); + convCrop(_rescaledActs, getActs(), rescaledSize, _tgtSize, startY, startX); + } + _rescaledActs.truncate(); // this'll have a different size each time so may as well truncate it. + } else if (IS_MULTIVIEW_TEST(passType)) { // for now... + _inputs[0]->copy(getActs()); + } else if (IS_TEST(passType)) { // Test on center patch + convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _maxScale); + } +} + +void RandomScaleLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * CropLayer + * ===================== + */ +CropLayer::CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _startX = pyDictGetInt(paramsDict, "startX"); + _startY = pyDictGetInt(paramsDict, "startY"); + _tgtSize = pyDictGetInt(paramsDict, "sizeX"); +} + +void CropLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convCrop(*_inputs[0], getActs(), _imgSize, _tgtSize, _startY, _startX); +} + +void CropLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * NailbedLayer + * ===================== + */ +NailbedLayer::NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputsX = pyDictGetInt(paramsDict, "outputsX"); +} + +void NailbedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convBedOfNails(*_inputs[0], getActs(), _channels, _imgSize, _start, _stride, 0, 1); +} + +void NailbedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convBedOfNailsUndo(v, _prev[replicaIdx][0]->getActsGrad(), 
_channels, _imgSize, _start, _stride, scaleTargets, 1); +} + +/* + * ===================== + * GaussianBlurLayer + * ===================== + */ +GaussianBlurLayer::GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _hFilter = pyDictGetMatrix(paramsDict, "filter"); +} + +GaussianBlurLayer::~GaussianBlurLayer() { + delete _hFilter; +} + +void GaussianBlurLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convGaussianBlur(*_inputs[0], _filter, getActs(), true, _channels, 0, 1); + convGaussianBlur(getActs(), _filter, getActs(), false, _channels, 0, 1); +} + +void GaussianBlurLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& tgt = _prev[replicaIdx][0]->getNumComputedActsGrads(getDeviceID()) > 0 ? _actGradsTmp : _prev[replicaIdx][0]->getActsGrad(); + convGaussianBlur(v, _filter, tgt, true, _channels, 0, 1); + convGaussianBlur(tgt, _filter, _prev[replicaIdx][0]->getActsGrad(), false, _channels, scaleTargets, 1); +} + +void GaussianBlurLayer::copyToGPU() { + _filter.copyFromHost(*_hFilter, true); +} + + /* + * ===================== + * HorizontalReflectionLayer + * ===================== + */ +HorizontalReflectionLayer::HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID) : Layer(convNet, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + assert(_channels >= 1 && _channels <= 3); +} + +void HorizontalReflectionLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convReflectHorizontal(*_inputs[0], getActs(), _imgSize); +} + +void HorizontalReflectionLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convReflectHorizontal(v, _prev[replicaIdx][0]->getActsGrad(), _imgSize); +} + +/* + * ===================== + * ResizeLayer + * ===================== + */ +ResizeLayer::ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _tgtSize = pyDictGetInt(paramsDict, "tgtSize"); + _scale = pyDictGetFloat(paramsDict, "scale"); +} + +void ResizeLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _scale); +} + +// Can't do this +void ResizeLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * RGBToYUVLayer + * ===================== + */ +RGBToYUVLayer::RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void RGBToYUVLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convRGBToYUV(*_inputs[0], getActs()); +} + +// Can't do this +void RGBToYUVLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * RGBToLABLayer + * ===================== + */ +RGBToLABLayer::RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _center = pyDictGetInt(paramsDict, "center"); +} + +void RGBToLABLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + 
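The color-space layers in this file (RGBToYUVLayer above, RGBToLABLayer here) are thin wrappers over per-pixel conversion kernels. As a rough standalone illustration of what such a conversion does, here is an RGB to YUV sketch using the common BT.601-style weights; the exact constants inside convRGBToYUV are not visible in this diff, so treat the numbers as an assumption:

#include <cstdio>

struct Pixel { float r, g, b; };
struct YUV   { float y, u, v; };

// One pixel of an RGB -> YUV transform (BT.601-style weights, assumed here).
static YUV rgbToYuv(const Pixel& p) {
    YUV out;
    out.y =  0.299f * p.r + 0.587f * p.g + 0.114f * p.b;
    out.u = -0.147f * p.r - 0.289f * p.g + 0.436f * p.b;
    out.v =  0.615f * p.r - 0.515f * p.g - 0.100f * p.b;
    return out;
}

int main() {
    Pixel p = {0.9f, 0.2f, 0.1f};
    YUV q = rgbToYuv(p);
    std::printf("Y=%.3f U=%.3f V=%.3f\n", q.y, q.u, q.v);
    return 0;
}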
convRGBToLAB(*_inputs[0], getActs(), _center); +} + +// Can't do this +void RGBToLABLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * ResponseNormLayer + * ===================== + */ +ResponseNormLayer::ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) +: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _size = pyDictGetInt(paramsDict, "size"); + _scale = pyDictGetFloat(paramsDict, "scale"); + _pow = pyDictGetFloat(paramsDict, "pow"); + _minDiv = pyDictGetFloat(paramsDict, "minDiv"); +} + +void ResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convResponseNorm(*_inputs[0], _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv); +} + +void ResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convResponseNormUndo(v, _denoms, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1); +} + +void ResponseNormLayer::truncBwdActs() { + Layer::truncBwdActs(); + _denoms.truncate(); +} + +/* + * ===================== + * CrossMapResponseNormLayer + * ===================== + */ +CrossMapResponseNormLayer::CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) +: ResponseNormLayer(convNetThread, paramsDict, replicaID) { + _blocked = pyDictGetInt(paramsDict, "blocked"); +} + +void CrossMapResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + assert(inpIdx == 0); + convResponseNormCrossMap(*_inputs[0], getActs(), _channels, _size, _scale, _pow, _minDiv, _blocked); +} + +void CrossMapResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convResponseNormCrossMapUndo(v, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, _minDiv, _blocked, scaleTargets, 1); +} + +/* + * ===================== + * ContrastNormLayer + * ===================== + */ +ContrastNormLayer::ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : ResponseNormLayer(convNetThread, paramsDict, replicaID) { +} + +void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + NVMatrix& images = *_inputs[0]; + convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler()); + _meanDiffs.add(images, -1, 1); + convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv); +} + +void ContrastNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convContrastNormUndo(v, _denoms, _meanDiffs, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1); +} + +void ContrastNormLayer::truncBwdActs() { + ResponseNormLayer::truncBwdActs(); + _meanDiffs.truncate(); +} + +/* + * ===================== + * CostLayer + * ===================== + */ +CostLayer::CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans) { + _coeff = pyDictGetFloat(paramsDict, "coeff"); + _numCases = 0; + _aggregated = pyDictGetInt(paramsDict, "aggregated") != 0; +} + +float CostLayer::getCoeff() { + return _coeff; +} + +void CostLayer::bprop(NVMatrix& v, 
PASS_TYPE passType, int passIdx) { + if (_coeff != 0) { + Layer::bprop(v, passType, passIdx); + } +} + +bool CostLayer::fprop(PASS_TYPE passType, int passIdx) { + if (Layer::fprop(passType, passIdx)) { + syncStream(); + getConvNet().getMessageQueue().enqueue(new Message(FPROP_TERMINAL)); + return true; + } + return false; +} + +void CostLayer::fpropCommon(PASS_TYPE passType) { + _numCases = Layer::getNumCases(*_inputs[0]); +} + +int CostLayer::getNumCases() { + return _numCases; +} + +bool CostLayer::isGradProducer() { + return _coeff != 0; +} + +doublev& CostLayer::getCost() { + return *new doublev(_costv); +} + +// This is called between microbatches +void CostLayer::resetPassIdx() { + Layer::resetPassIdx(); + _costv.clear(); +} + +CostLayer& CostLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID) { + if (type == "cost.crossent") { + return *new CrossEntCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.bce") { + return *new BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.dce") { + return *new DetectionCrossEntropyCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.logreg") { + return *new LogregCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.sum2") { + return *new SumOfSquaresCostLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown cost layer type ") + type; +} + +/* + * ===================== + * CrossEntCostLayer + * ===================== + */ +CrossEntCostLayer::CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { +} + +void CrossEntCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + computeCrossEntCost(labels, probs, _trueLabelLogProbs, _correctProbs); + _costv.clear(); + _costv.push_back(-_trueLabelLogProbs.sum()); + _costv.push_back(numCases - _correctProbs.sum()); + } +} + +void CrossEntCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 1); + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a softmax layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. 
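The shortcut this comment refers to falls out of the algebra: for softmax outputs y and target distribution p, pushing dE/dy = coeff * p / y through the softmax Jacobian collapses to coeff * (p - y) with respect to the softmax inputs, so the near-zero division never has to be materialized. A small numeric check of the two routes (standalone sketch, not the library's API):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float coeff = 1.0f;
    std::vector<float> x = {2.0f, -1.0f, 0.5f};   // softmax inputs (logits)
    std::vector<float> p = {1.0f, 0.0f, 0.0f};    // target distribution

    // Softmax.
    std::vector<float> y(x.size());
    float Z = 0.0f;
    for (float xi : x) Z += std::exp(xi);
    for (size_t i = 0; i < x.size(); ++i) y[i] = std::exp(x[i]) / Z;

    // Route 1: form dE/dy = coeff * p / y, then push it through the softmax Jacobian.
    std::vector<float> dEdy(x.size()), viaJacobian(x.size());
    for (size_t i = 0; i < x.size(); ++i) dEdy[i] = coeff * p[i] / y[i]; // blows up when y ~ 0
    for (size_t i = 0; i < x.size(); ++i) {
        float dot = 0.0f;
        for (size_t j = 0; j < x.size(); ++j) dot += dEdy[j] * y[j];
        viaJacobian[i] = y[i] * (dEdy[i] - dot);
    }

    // Route 2: fused form, dE/dx = coeff * (p - y). No division by y at all.
    for (size_t i = 0; i < x.size(); ++i) {
        std::printf("i=%zu  jacobian=%f  fused=%f\n",
                    i, viaJacobian[i], coeff * (p[i] - y[i]));
    }
    return 0;
}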
+ bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" || prev[1]->getDeviceID() != getDeviceID(); + if (doWork) { + computeCrossEntGrad(labels, probs, target, scaleTargets == 1, _coeff); + } +} + +/* + * ===================== + * BinomialCrossEntropyCostLayer + * ===================== + */ +BinomialCrossEntropyCostLayer::BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { + _computeSoftmaxErrorRate = pyDictGetInt(paramsDict, "computeSoftmaxErrorRate"); + _posWeight = pyDictGetFloat(paramsDict, "posWeight"); +} + +void BinomialCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + labels.applyBinary(BinomialCrossEntOperator(_posWeight), probs, _tmpProbs); + _costv.clear(); + // Cross-entropy cost + _costv.push_back(-_tmpProbs.sum(_tmpbuf));// / labels.getFollowingDim()); + + // If aggregated, we don't produce these outputs because they're not additive. + // They have no meaning if this is just a partial cost. + if (!_aggregated) { + // "Correct" classifications. To compute these we threshold probs + // and just count the number of entries that agree with labels. + probs.biggerThanScalar(0.5, _tmpProbs); + _tmpProbs.equals(labels); + _costv.push_back((_tmpProbs.getNumElements() - _tmpProbs.sum(_tmpbuf)) / double(labels.getFollowingDim())); + + if (_computeSoftmaxErrorRate) { + // Also compute top-1 error as if this is softmax and there's only one correct class + probs.max(0, _tmpVec); + assert(_tmpVec.getNumElements() == numCases); // Make sure we did max on correct axis + probs.equalsVector(_tmpVec, _correctProbs); + _correctProbs.sum(0, _tmpVec); // Divide by the # of labels that we predict as being present + float m = _tmpVec.max(); + + _correctProbs.eltwiseDivideByVector(_tmpVec); + _correctProbs.eltwiseMult(labels); + + _costv.push_back(numCases - _correctProbs.sum(_tmpbuf)); + } + } + } +} + +void BinomialCrossEntropyCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 1); + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a logistic neuron layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. 
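The analogous cancellation for a logistic output is what this comment counts on: with y = sigmoid(x) and binary target t, the derivative of t*log(y) + (1-t)*log(1-y) with respect to x is simply t - y, so folding the gradient into the logistic layer avoids dividing by y*(1-y). A small numeric check (sketch only; the posWeight weighting used by BinomialCrossEntGradientOperator is not reproduced here):

#include <cmath>
#include <cstdio>

static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
    const float t = 1.0f;   // binary label
    const float x = -4.0f;  // pre-sigmoid activation
    const float y = sigmoid(x);

    // Chain rule written out: dE/dy * dy/dx, with E = t*log(y) + (1-t)*log(1-y).
    float dEdy = t / y - (1.0f - t) / (1.0f - y);  // huge when y is near 0 or 1
    float dydx = y * (1.0f - y);                   // tiny in the same regime
    float chained = dEdy * dydx;

    // Fused form: the product collapses to t - y.
    float fused = t - y;

    std::printf("chained=%f fused=%f\n", chained, fused);
    return 0;
}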
+ bool doWork = prev[1]->getNext().size() > 1 + || prev[1]->getType() != "neuron" + || static_cast(prev[1])->getNeuronType() != "logistic" + || prev[1]->getDeviceID() != getDeviceID() + || prev[1]->getNumReplicas() != getNumReplicas(); + if (doWork) { + printf("Computing cross-entropy gradient the stupid way\n"); + if (scaleTargets == 0) { + labels.applyBinary(BinomialCrossEntGradientOperator(_coeff, _posWeight), probs, target); + } else { + labels.applyTernary(AddGradientBinaryOperator(BinomialCrossEntGradientOperator(_coeff, _posWeight)), probs, target, target); + } + } +} + +float BinomialCrossEntropyCostLayer::getPosWeight() { + return _posWeight; +} +/* + * ===================== + * DetectionCrossEntropyCostLayer + * ===================== + */ +DetectionCrossEntropyCostLayer::DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID) { + assert(!_aggregated); +} + +void DetectionCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + BinomialCrossEntropyCostLayer::fpropActs(inpIdx, scaleTargets, passType, passIdx); + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + + /* + * Add information sufficient to compute precision and recall for each class. + */ + // NOTE: _tmpProbs contains ((probs > 0.5) == labels) + labels.sum(1, _numPositive); // sum(labels, 1) + + _tmpProbs.eltwiseMult(labels); // labels * ((probs > 0.5) == labels) + _tmpProbs.sum(1, _numTruePositive); + + probs.biggerThanScalar(0.5, _tmpProbs); + _tmpProbs.sum(1, _numDeclaredPositive); + + _numDeclaredPositive.copyToHost(_hNumDeclaredPositive, true); + _numPositive.copyToHost(_hNumPositive, true); + _numTruePositive.copyToHost(_hNumTruePositive, true); + + for (int i = 0; i < labels.getFollowingDim(); ++i) { + _costv.push_back(_hNumDeclaredPositive(i, 0)); // 2 + _costv.push_back(_hNumPositive(i, 0)); // 3 + _costv.push_back(_hNumTruePositive(i, 0)); // 4 + } + + } +} + +/* + * ===================== + * LogregCostLayer + * ===================== + */ +LogregCostLayer::LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { + _topk = pyDictGetInt(paramsDict, "topk"); +// _numAccumed = 0; +} + +void LogregCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix* probs = _inputs[1]; + + _doCompute = !IS_MULTIVIEW_TEST(passType); + if (!_doCompute) { + if (IS_MULTIVIEW_TEST_START(passType)) { + if (_probsAccum.count(passIdx) == 0) { + _probsAccum[passIdx] = new NVMatrix(*probs); + } + probs->copy(*_probsAccum[passIdx]); + _numAccumed[passIdx] = 1; + } else { + _probsAccum[passIdx]->add(*probs); + _numAccumed[passIdx] += 1; + } + if (IS_MULTIVIEW_TEST_END(passType)) { + probs = _probsAccum[passIdx]; + probs->scale(1.0 / _numAccumed[passIdx]); + _doCompute = true; + } + } + if (_doCompute) { + int numCases = labels.getNumElements(); + probs->max(0,_maxProbs); + if (_topk == 1) { + computeLogregCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs); + } else { + computeMultiSoftmaxCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs, _topkProbs, _topk); + } + _costv.clear(); + double top1 = 
_correctProbs.sum(_tmpbuf); + + _costv.push_back(-_trueLabelLogProbs.sum(_tmpbuf)); + _costv.push_back(numCases - top1); + _costv.push_back(numCases - (_topk == 1 ? top1 : _topkProbs.sum(_tmpbuf))); + + } + } +} + +NVMatrix& LogregCostLayer::getProbsAccum(int replicaIdx) { + return *_probsAccum[replicaIdx]; +} + +void LogregCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + if (inpIdx == 1) { + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a softmax layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. + bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" + || prev[1]->getDeviceID() != getDeviceID() || prev[1]->getNumReplicas() != getNumReplicas(); + if (prev[1]->getType() == "softmax") { + static_cast(prev[1])->setDoUpperGrad(!doWork); + } + if (doWork) { + computeLogregGrad(labels, probs, target, scaleTargets == 1, _coeff); + } + } +} + +/* + * ===================== + * SumOfSquaresCostLayer + * ===================== + */ +SumOfSquaresCostLayer::SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { +} + +void SumOfSquaresCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + _inputs[0]->apply(NVMatrixOps::Square(), _tmp); + _costv.clear(); + _costv.push_back(_tmp.sum()); +} + +void SumOfSquaresCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + _prev[replicaIdx][inpIdx]->getActsGrad().add(*_inputs[0], scaleTargets, -2 * _coeff); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu new file mode 100644 index 0000000..39995a6 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu @@ -0,0 +1,555 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include "../include/layer_kernels.cuh" + +using namespace std; + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxEnergies: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * top5Probs: (1, numCases) (*out) + * + * target: (1, numCases) + * + */ +__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs, + float* labelLogProbs, float* correctProbs, float* top5Probs, + const int numCases, const int numOut, const int setSize) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + const int label = int(labels[tx]); + const float maxp = maxProbs[tx]; + const float labelp = probs[label * numCases + tx]; + + labelLogProbs[tx] = __logf(labelp); + + int numBiggerProbs = 0, numEqualsProbs = 0; + for (int i = 0; i < numOut; ++i) { + numBiggerProbs += probs[i * numCases + tx] > labelp; + numEqualsProbs += probs[i * numCases + tx] == labelp; + } + + const int slotsLeft = setSize - numBiggerProbs; + + top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs); + correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs); + } +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * top5Probs: (1, numCases) (*out) + * + * target: (1, numCases) == log(y_l[labels,:] + */ +void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, + NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.getNumElements() == numCases); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + +// NVMatrix& maxProbs = probs.max(0); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + top5Probs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + + cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1); + kMultiSoftmaxCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(), + numCases, numOut, setSize); + + getLastCudaError("kMultiSoftmaxCost: Kernel execution failed"); +// cudaThreadSynchronize(); +} + +/* + * E = sum(p_l * log(y_l)) + * probs: (numOut, numCases) + * labels: (numOut, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs, + const int numCases, const int numOut) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + probs += tx; + labels += tx; + maxProbs += tx; + labelLogProbs += tx; + correctProbs += tx; + + const float maxp = maxProbs[0]; + + /* + * Compute the probability of guessing the correct case if you take the most-probable label. + * + * This is done like this: + * + * - If the most probable label is not equal to the true label, then the probability is zero. 
+ * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum). + * + * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned + * maximum probability. But it's a safety measure to prevent over-estimating your accuracy. + * Though it could never happen in reality. Well it could. But it wouldn't. Cool? + */ + float crossEnt = 0.0f; + int numMax = 0; + bool correctLabel = false; + for (int i = 0; i < numOut; i++) { + const float label_prob = labels[i * numCases]; + const float model_prob = probs[i * numCases]; + numMax += model_prob == maxp; + crossEnt += label_prob * safelog(model_prob); + correctLabel |= model_prob == maxp && label_prob > 0.0f; + } + labelLogProbs[0] = crossEnt; + if (!correctLabel) { + correctProbs[0] = 0.0f; + } else { + correctProbs[0] = 1.0f / float(numMax); + } + } +} + +/* + * E = sum(p_l * log(y_l)) + * y_l: (numOut, numCases) + * labels: (numOut, numCases) + * + * dE_dy_l: (numOut, numCases) + */ +template +__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const float label_prob = labels[tidx]; + const float model_prob = y_l[tidx]; + const float v = gradCoeff * __fdividef(label_prob, model_prob); + if (add) { + dE_dy_l[tidx] += v; + } else { + dE_dy_l[tidx] = v; + } + } +} + +/* + * E = sum(p_l * log(y_l)) + * y_l: (numOut, numCases) + * labels: (numOut, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const float model_prob = y_l[tidx]; + const float label_prob = labels[tidx]; + float v = gradCoeff * (label_prob - model_prob); + if (add) { + dE_dx_l[tidx] += v; + } else { + dE_dx_l[tidx] = v; + } + } +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs, + const int numCases, const int numOut) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + const int label = int(labels[tx]); + const float maxp = maxProbs[tx]; + const float labelp = probs[label * numCases + tx]; + + labelLogProbs[tx] = __logf(labelp); + + /* + * Compute the probability of guessing the correct case if you take the most-probable label. + * + * This is done like this: + * + * - If the most probable label is not equal to the true label, then the probability is zero. + * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum). + * + * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned + * maximum probability. But it's a safety measure to prevent over-estimating your accuracy. + * Though it could never happen in reality. Well it could. 
But it wouldn't. Cool? + */ + if (labelp != maxp) { + correctProbs[tx] = 0; + } else { + int numMax = 0; + for (int i = 0; i < numOut; i++) { + numMax += probs[i * numCases + tx] == maxp; + } + correctProbs[tx] = 1.0f / float(numMax); + } + } +} + +/* + * E = -log(y_t) + * y_l: (numOut, numCases) + * labels: (1, numCases) + * + * dE_dy_l: (numOut, numCases) + */ +template +__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const int label = int(labels[tx]); + float v = gradCoeff * (label == ty); + v = __fdividef(v, y_l[tidx]); + if (add) { + dE_dy_l[tidx] += v; + } else { + dE_dy_l[tidx] = v; + } + } +} + +/* + * E = -log(y_t) + * y_l: (numOut, numCases) + * labels: (1, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const int label = int(labels[tx]); + float v = gradCoeff * ((label == ty) - y_l[tidx]); + if (add) { + dE_dx_l[tidx] += v; + } else { + dE_dx_l[tidx] = v; + } + } +} + +/* + * dE_dy_l: (numOut, numCases) + * y_l: (numOut, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + float v = 0; + for (int j = 0; j < numOut; j++) { + v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]); + } + v *= y_l[tidx]; + + if (add) { + dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v; + } else { + dE_dx_l[tidx] = scaleGrad * v; + } + } +} + +template +__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target, + const int numElements) { + for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) { + if (add) { + target[i] += actGrad[i] * (output[i] == input[i]); + } else { + target[i] = actGrad[i] * (output[i] == input[i]); + } + } +} + +void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) { + assert(actGrad.isContiguous()); + assert(output.isContiguous()); + assert(input.isContiguous()); + assert(actGrad.isSameDims(input)); + assert(actGrad.isSameDims(output)); + + dim3 blocks(DIVUP(actGrad.getNumElements(), 128)); + dim3 threads(128); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (add) { + assert(actGrad.isSameDims(target)); + cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1); + kEltwiseMaxGrad<128, true><<>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements()); + } else { + target.resize(actGrad); + cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1); + kEltwiseMaxGrad<128, 
false><<>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements()); + } + + getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed"); +} + +/* + * E = sum_i{-p_i*log(y_i)} + * probs: (numOut, numCases) + * labels: (numOut, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.isSameDims(probs)); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + + NVMatrix& maxProbs = probs.max(0); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1); + kCrossEntCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), + numCases, numOut); + getLastCudaError("kCrossEntCost: Kernel execution failed"); + + delete &maxProbs; +} + +void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.isSameDims(probs)); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kCrossEntGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kCrossEntGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("kCrossEntGrad: Kernel execution failed"); +} + +void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) { + int numCases = acts.getLeadingDim(); + int numOut = acts.getFollowingDim(); + + assert(acts.isSameDims(actsGrad)); + assert(acts.isContiguous()); + assert(actsGrad.isContiguous()); + assert(target.isContiguous()); + assert(acts.isTrans()); + assert(actsGrad.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + + if (scaleTarget == 0) { + target.resize(acts); + kSoftmaxGrad<<>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad); + } else { + kSoftmaxGrad<<>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad); + } + getLastCudaError("computeSoftmaxGrad: Kernel execution failed"); +} + +void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getLeadingDim() == probs.getLeadingDim() && 
labels.getFollowingDim() == probs.getFollowingDim()); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(probs.isTrans()); + assert(!labels.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad, cudaFuncCachePreferL1); + kCrossEntSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad, cudaFuncCachePreferL1); + kCrossEntSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed"); +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) == log(y_l[labels,:] + */ +void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.getNumElements() == numCases); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1); + kLogregCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), + numCases, numOut); + getLastCudaError("computeLogregCost: Kernel execution failed"); +} + +void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getNumElements() == numCases); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kLogregCostGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kLogregCostGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("computeLogregGrad: Kernel execution failed"); +} + +void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getNumElements() == numCases); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + 
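The gradient kernels in this file come in two flavors, overwrite and accumulate, selected by a compile-time flag so the per-element branch costs nothing, and the host wrappers resize the target only in the overwrite case before launching. A stripped-down CPU sketch of that dispatch pattern, with illustrative names:

#include <cstdio>
#include <vector>

// 'add' is a compile-time flag: true accumulates into target, false overwrites it.
template <bool add>
static void scaleInto(const std::vector<float>& src, float coeff, std::vector<float>& target) {
    for (size_t i = 0; i < src.size(); ++i) {
        float v = coeff * src[i];
        if (add) {
            target[i] += v;   // branch is resolved at compile time in each instantiation
        } else {
            target[i] = v;
        }
    }
}

int main() {
    std::vector<float> grad = {1.0f, 2.0f, 3.0f};
    std::vector<float> target(grad.size(), 10.0f);

    scaleInto<false>(grad, 0.5f, target);  // overwrite: target becomes {0.5, 1.0, 1.5}
    scaleInto<true>(grad, 0.5f, target);   // accumulate: target becomes {1.0, 2.0, 3.0}

    for (float v : target) std::printf("%f\n", v);
    return 0;
}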
cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kLogregSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kLogregSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed"); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu new file mode 100644 index 0000000..1cea787 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/lr.cuh" +#include "../include/util.cuh" + +/* + * ================================== + * ParameterSchedule + * ================================== + */ +ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) { + std::string type = pyDictGetString(schedDict, "type"); + PyObject* paramsDict = PyDict_GetItemString(schedDict, "params"); + double base = pyDictGetFloat(paramsDict, "base"); + if (type == "const") { + return *new ParameterSchedule(base); + } else { + double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor"); + if (type == "linear") { + return *new LinearParameterSchedule(base, tgtFactor); + } else if (type == "exp") { + return *new ExpParameterSchedule(base, tgtFactor); + } else if (type == "dexp") { + double numSteps = pyDictGetInt(paramsDict, "numSteps"); + return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps); + } + } + throw std::string("Unknown learning rate schedule type ") + type; +} + +ParameterSchedule::ParameterSchedule(double baseRate) + : _baseRate(baseRate) { +} + +double ParameterSchedule::getValue(double progress) { + return _baseRate; +} + +double ParameterSchedule::getBaseValue() const { + return _baseRate; +} + +ParameterSchedule::~ParameterSchedule() { +} + +/* + * ================================== + * LinearParameterSchedule + * ================================== + */ +LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor) +: ParameterSchedule(baseRate) { + _finalRate = baseRate / tgtFactor; +} + +double LinearParameterSchedule::getValue(double progress) { + return _baseRate * (1 - progress) + _finalRate * progress; +} + +/* + * ================================== + * ExpParameterSchedule + * ================================== + */ +ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor) +: ParameterSchedule(baseRate) { + _powBase = 1.0 / tgtFactor; +} + +double ExpParameterSchedule::getValue(double progress) { + return _baseRate * std::pow(_powBase, progress); +} + +/* + * ================================== + * DiscreteExpParameterSchedule + * ================================== + */ +DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps) 
+: ParameterSchedule(baseRate) { + ExpParameterSchedule elrs(baseRate, tgtFactor); + double finalRate = baseRate / tgtFactor; + for (int i = 0; i < numSteps - 1; i++) { + double progress = double(i) / (numSteps - 1); + _rates.push_back(elrs.getValue(progress)); + } + _rates.push_back(finalRate); + //printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps); +} + +double DiscreteExpParameterSchedule::getValue(double progress) { + for (int i = 0; i < _rates.size(); ++i) { + if (progress <= double(i + 1) / _rates.size()) { + return _rates[i]; + } + } + return _rates.back(); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu new file mode 100644 index 0000000..cd2d299 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu @@ -0,0 +1,139 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/memorysource.cuh" + +using namespace std; + +/* + * ======================= + * MemoryView + * ======================= + */ +MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) { +} + +MemoryView::~MemoryView() { +// if (_src->truncate(_name)) { +// delete _src; +// } +} + +NVMatrix& MemoryView::getMemory(int numCases) { + return _src->getMemory(_name, numCases); +} + +NVMatrix& MemoryView::getMemory() { + return _src->getMemory(_name); +} + +MemorySource& MemoryView::getMemorySource() { + return *_src; +} + +bool MemoryView::isParent() { + return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize(); +} + +std::string& MemoryView::getName() { + return _name; +} + +MemoryView& MemoryView::clone(std::string& name) { + return _src->addUser(name, _src->getRange(_name)); +} + +/* + * ======================= + * MemorySource + * ======================= + */ +MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) { +} + +MemorySource::~MemorySource() { + // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource. + // So this is a no-op. +} + +NVMatrix& MemorySource::getMemory(std::string& name) { + return getMemory(name, _memory.getLeadingDim()); +} + +// Deletes old view when appropriate +NVMatrix& MemorySource::getMemory(std::string& name, int numCases) { + numCases = numCases < 0 ? 
_memory.getLeadingDim() : numCases; + _lock.acquire(); + if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) { + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_deviceID); + _memory.resize(_size, numCases, false); + for (map::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) { + delete it->second; + } + _memoryViews.clear(); + if (d >= 0) { + NVMatrix::setDeviceID(d); + } + } + if (_memoryViews.count(name) == 0) { + assert(!_memory.isTrans()); + _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second); + } + NVMatrix& view = *_memoryViews[name]; + assert(view.isContiguous()); + _lock.release(); + return view; +} + +MemoryView& MemorySource::addUser(std::string& name, std::pair range) { + assert(_viewRanges.count(name) == 0); + _viewRanges[name] = range; + return *new MemoryView(*this, name); +} + +MemoryView& MemorySource::addUser(std::string& name) { + return addUser(name, std::pair(0, _size)); +} + +MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) { + return (new MemorySource(size, deviceID))->addUser(parentUser); +} + +pair MemorySource::getRange(std::string& name) { + return _viewRanges[name]; +} + +int MemorySource::getSize() { + return _size; +} + +bool MemorySource::truncate(std::string& name) { + bool truncated = false; + _lock.acquire(); + _truncateRequests.insert(name); + if (_truncateRequests.size() == _viewRanges.size()) { + for (map::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) { + delete it->second; + } + _memoryViews.clear(); + _memory.truncate(); + _truncateRequests.clear(); + truncated = true; + } + _lock.release(); + return truncated; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu new file mode 100644 index 0000000..bf6fd40 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu @@ -0,0 +1,75 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../include/neuron.cuh" +#include "../include/util.cuh" + +using namespace std; + +Neuron& Neuron::makeNeuron(PyObject* neuronDict) { + std::string type = pyDictGetString(neuronDict, "type"); + PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params"); + + if (type == "relu") { + return *new ReluNeuron(); + } + + if (type == "drelu") { + return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "softrelu") { + return *new SoftReluNeuron(); + } + + if (type == "brelu") { + return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "abs") { + return *new AbsNeuron(); + } + + if (type == "logistic") { + return *new LogisticNeuron(); + } + + if (type == "tanh") { + return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); + } + + if (type == "square") { + return *new SquareNeuron(); + } + + if (type == "sqrt") { + return *new SqrtNeuron(); + } + + if (type == "linear") { + return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); + } + + if (type == "log") { + return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "ident") { + return *new Neuron(); + } + + throw std::string("Unknown neuron type: ") + type; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu new file mode 100644 index 0000000..ed1aacf --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu @@ -0,0 +1,271 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../util/include/matrix.h" +#include "../../util/include/queue.h" +#include "../include/worker.cuh" +#include "../include/util.cuh" +#include "../include/cost.cuh" + +#include "../include/pyconvnet.cuh" +#include "../include/convnet.cuh" + +#include "../include/jpeg.h" + +using namespace std; +static ConvNet* model = NULL; + +static PyMethodDef _ConvNetMethods[] = {{ "initModel", initModel, METH_VARARGS }, + { "startBatch", startBatch, METH_VARARGS }, + { "finishBatch", finishBatch, METH_VARARGS }, + { "checkGradients", checkGradients, METH_VARARGS }, + { "startMultiviewTest", startMultiviewTest, METH_VARARGS }, + { "startFeatureWriter", startFeatureWriter, METH_VARARGS }, + { "startDataGrad", startDataGrad, METH_VARARGS }, + { "syncWithHost", syncWithHost, METH_VARARGS }, + { "decodeJpeg", decodeJpeg, METH_VARARGS }, + { NULL, NULL } +}; + +void init_ConvNet() { + (void) Py_InitModule("_ConvNet", _ConvNetMethods); + import_array(); +} + +void signalHandler(int sig) { + const size_t max_trace_size = 40; + void *array[max_trace_size]; + size_t trace_size = backtrace(array, max_trace_size); + fprintf(stderr, "Error signal %d:\n", sig); + backtrace_symbols_fd(array, trace_size, STDERR_FILENO); + exit(1); +} + +PyObject* initModel(PyObject *self, PyObject *args) { + assert(model == NULL); + signal(SIGSEGV, signalHandler); + signal(SIGABRT, signalHandler); + + PyDictObject* pyLayerParams; + PyListObject* pyDeviceIDs; + int pyMinibatchSize; + int conserveMem; + + if (!PyArg_ParseTuple(args, "O!O!ii", + &PyDict_Type, &pyLayerParams, + &PyList_Type, &pyDeviceIDs, + &pyMinibatchSize, + &conserveMem)) { + return NULL; + } + intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs); + + model = new ConvNet((PyObject*)pyLayerParams, + deviceIDs, + pyMinibatchSize, + conserveMem); + + model->start(); + return Py_BuildValue("i", 0); +} + +/* + * Starts training/testing on the given batch (asynchronous -- returns immediately). + */ +PyObject* startBatch(PyObject *self, PyObject *args) { + assert(model != NULL); +// printf("starting next batch\n"); + PyListObject* data; + double progress; + int test = 0; + if (!PyArg_ParseTuple(args, "O!d|i", + &PyList_Type, &data, + &progress, + &test)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + + TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +/* + * Starts testing on the given batch (asynchronous -- returns immediately). + */ +PyObject* startMultiviewTest(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + int numViews; + PyArrayObject* pyProbs = NULL; + char* logregName = NULL; + if (!PyArg_ParseTuple(args, "O!i|O!s", + &PyList_Type, &data, + &numViews, + &PyArray_Type, &pyProbs, + &logregName)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + MultiviewTestWorker* wr = pyProbs == NULL ? 
new MultiviewTestWorker(*model, *cpuData, numViews) + : new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +PyObject* startFeatureWriter(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + PyListObject* pyFtrs; + PyListObject* pyLayerNames; + if (!PyArg_ParseTuple(args, "O!O!O!", + &PyList_Type, &data, + &PyList_Type, &pyFtrs, + &PyList_Type, &pyLayerNames)) { + return NULL; + } + stringv* layerNames = getStringV((PyObject*)pyLayerNames); + CPUData* cpuData = new CPUData((PyObject*)data); + MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs); + + FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +PyObject* startDataGrad(PyObject *self, PyObject *args) { +// assert(model != NULL); +// PyListObject* data; +// int dataLayerIdx, softmaxLayerIdx; +// if (!PyArg_ParseTuple(args, "O!ii", +// &PyList_Type, &data, +// &dataLayerIdx, &softmaxLayerIdx)) { +// return NULL; +// } +// CPUData* cpuData = new CPUData((PyObject*)data); +// Matrix& ftrs = *mvec.back(); +// mvec.pop_back(); +// +// DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx); +// model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +/* + * Waits for the trainer to finish training on the batch given to startBatch. + * This is a blocking call so lets release the GIL. + */ +PyObject* finishBatch(PyObject *self, PyObject *args) { + assert(model != NULL); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::BATCH_DONE); + + Cost& cost = res->getResults(); + PyObject* dict = PyDict_New(); + CostMap& costMap = cost.getCostMap(); + for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) { + PyObject* v = PyList_New(0); + for (vector::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) { + PyObject* f = PyFloat_FromDouble(*iv); + PyList_Append(v, f); + } + PyDict_SetItemString(dict, it->first.c_str(), v); + } + PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases()); + delete res; // Deletes cost too + + return retVal; +} + +PyObject* checkGradients(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + if (!PyArg_ParseTuple(args, "O!", + &PyList_Type, &data)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + + GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData); + model->getWorkerQueue().enqueue(wr); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::BATCH_DONE); + delete res; + return Py_BuildValue("i", 0); +} + +/* + * Copies weight matrices from GPU to system memory. 
+ */ +PyObject* syncWithHost(PyObject *self, PyObject *args) { + assert(model != NULL); + SyncWorker* wr = new SyncWorker(*model); + model->getWorkerQueue().enqueue(wr); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::SYNC_DONE); + + delete res; + return Py_BuildValue("i", 0); +} + +PyObject* decodeJpeg(PyObject *self, PyObject *args) { + PyListObject* pyJpegStrings; + PyArrayObject* pyTarget; + int img_size, inner_size, test, multiview; + if (!PyArg_ParseTuple(args, "O!O!iiii", + &PyList_Type, &pyJpegStrings, + &PyArray_Type, &pyTarget, + &img_size, + &inner_size, + &test, + &multiview)) { + return NULL; + } + + Thread* threads[NUM_JPEG_DECODER_THREADS]; + int num_imgs = PyList_GET_SIZE(pyJpegStrings); + int num_imgs_per_thread = DIVUP(num_imgs, NUM_JPEG_DECODER_THREADS); + Matrix& dstMatrix = *new Matrix(pyTarget); + for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { + int start_img = t * num_imgs_per_thread; + int end_img = min(num_imgs, (t+1) * num_imgs_per_thread); + + threads[t] = new DecoderThread((PyObject*)pyJpegStrings, dstMatrix, start_img, end_img, img_size, inner_size, test, multiview); + threads[t]->start(); + } + + for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { + threads[t]->join(); + delete threads[t]; + } + assert(dstMatrix.isView()); + delete &dstMatrix; + return Py_BuildValue("i", 0); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu new file mode 100644 index 0000000..e58c640 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu @@ -0,0 +1,350 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/reducepipeline.cuh" + +using namespace std; + +/* ========================= + * IReducerSegment + * ========================= + */ +// Null mat --> reducer on host +IReduceSegment::IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue* finishQueue) +: _deviceID(deviceID), _next(NULL), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getTgtDeviceID())) { +} + +IReduceSegment::~IReduceSegment() { +} + +NVMatrix& IReduceSegment::getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx) { + NVMatrix& line = mat.reshaped(1, mat.getNumElements()); + int start = chunkIdx * chunkSize; + int end = min((chunkIdx+1) * chunkSize, mat.getNumElements()); +// _mat->printShape("_mat"); + NVMatrix& chunk = line.sliceCols(start, end); + delete &line; +// chunk.printShape("chunk"); + return chunk; +} + +void* IReduceSegment::run() { + bool exit = false; + while (!exit) { + ReduceMessage& msg = *_queue.dequeue(); + if (msg.getType() == EXIT) { + exit = true; + } else { + bool term = processMessage(msg); + if (term) { + assert(_finishQueue); + _finishQueue->enqueue(1); + } + } + delete &msg; + } + return NULL; +} + +inline NVMatrix& IReduceSegment::getMatrix(ReduceMessage& msg) { + return msg.getMatrix(getDeviceID()); +} + +Queue& IReduceSegment::getQueue() { + return _queue; +} + +inline int IReduceSegment::getDeviceID() const { + return _deviceID; +} + +void IReduceSegment::addPrev(IReduceSegment& c) { + _prev.push_back(&c); +} + +void IReduceSegment::addNext(ReducePeer& c) { + assert(_next == NULL); + _next = &c; + c.addPrev(*this); +} + +bool IReduceSegment::isTerminal() const { + return _next == NULL; +} + +/* ========================= + * ReducerSource + * ========================= + */ +ReducerSource::ReducerSource(IEightGPUReducer& parent, int deviceID) : IReduceSegment(parent, deviceID, NULL) { +} + +bool ReducerSource::processMessage(ReduceMessage& msg) { + assert(msg.getType() == REDUCE_START); + int numChunks = min(getMatrix(msg).getNumElements(), max(REDUCE_MIN_CHUNKS, min(REDUCE_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), REDUCE_MIN_CHUNK_SIZE)))); + int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks); + //printf("num chunks: %d\n", numChunks); + for (int c = 0; c <= numChunks; ++c) { + _next->getQueue().enqueue(new ReduceChunkMessage(*this, c, chunkSize, numChunks, msg.getScaleIntermediates(), msg.getScaleTarget(), msg.getMatrices())); + } + return false; +} + +/* ========================= + * ReducerPeer + * ========================= + */ +ReducePeer::ReducePeer(IEightGPUReducer& parent,int deviceID, Queue* finishQueue) : IReduceSegment(parent, deviceID, finishQueue), _numInputsFinished(0) { + _add = deviceID != DEVICE_HOST; +} + +ReducePeer::ReducePeer(IEightGPUReducer& parent) : IReduceSegment(parent, DEVICE_HOST, NULL), _numInputsFinished(0), _add(false) { +} + +ReducePeer::~ReducePeer() { + for(std::map::iterator it = _streams.begin(); it != _streams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(it->second)); + } + _streams.clear(); +} + +inline cudaStream_t ReducePeer::getStream(int deviceID) { + if (deviceID < 0) { + return NULL; + } + if (_streams.count(deviceID) == 0) { + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_streams[deviceID], cudaStreamNonBlocking)); + } + return _streams[deviceID]; +} + +bool ReducePeer::processMessage(ReduceMessage& msg) { + assert(msg.getType() == REDUCE_CHUNK); + + ReduceChunkMessage& cmsg = *static_cast(&msg); +// if 
(_numInputsReceived.count(cmsg.getChunkIdx()) == 0) { +// _numInputsReceived[cmsg.getChunkIdx()] = 0; +// } + int& inputsRcvd = ++_numInputsReceived[cmsg.getChunkIdx()]; +// printf("reducer on device %d got msg chunk idx %d of %d, inputs rcvd for this chunk idx: %d/%d\n", +// getDeviceID(), cmsg.getChunkIdx(), cmsg.getNumChunks(),_numInputsReceived[cmsg.getChunkIdx()], _prev.size()); + if (cmsg.getChunkIdx() < cmsg.getNumChunks()) { + IReduceSegment& src = cmsg.getSource(); + float scalePrev = isTerminal() ? cmsg.getScaleIntermediates() : 1; + float scaleSelf = inputsRcvd == 1 ? _add * (isTerminal() ? cmsg.getScaleTarget() : 1): 1; + if (scaleSelf == 0 || isTerminal()) { + if (getDeviceID() >= 0) { + NVMatrix::setDeviceID(getDeviceID()); + } + getMatrix(msg).resize(src.getMatrix(msg)); + } + assert(getMatrix(msg).isSameDims(src.getMatrix(msg))); + NVMatrix& prevChunk = getChunk(src.getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + int execDeviceID = getDeviceID() >= 0 ? getDeviceID() : src.getDeviceID(); + if (execDeviceID >= 0) { + NVMatrix::setDeviceID(execDeviceID); + prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, getStream(execDeviceID)); + NVMatrix::syncStream(getStream(execDeviceID)); + } else { + assert(!isTerminal()); + hostAdd(prevChunk.getDevData(), myChunk.getDevData(), prevChunk.getNumElements(), scaleSelf); + } + + delete &prevChunk; + delete &myChunk; + + } else { + _numInputsFinished++; + } + if (!isTerminal() && inputsRcvd == _prev.size()) { +// printf(" device %d enqueueing msg for next on device %d\n", getDeviceID(), _next->getDeviceID()); + _next->getQueue().enqueue( + new ReduceChunkMessage(*this, cmsg.getChunkIdx(), cmsg.getChunkSize(), cmsg.getNumChunks(), + cmsg.getScaleIntermediates(), cmsg.getScaleTarget(), cmsg.getMatrices())); + } + + bool finished = _numInputsFinished == _prev.size(); + if (finished) { + _numInputsFinished = 0; + _numInputsReceived.clear(); + } + return finished && isTerminal(); +} + +void ReducePeer::hostAdd(const float* src, float* tgt, const int n, const float scaleTgt) { + if (scaleTgt != 0) { + for (int i = 0; i < n; ++i) { + tgt[i] = scaleTgt * tgt[i] + src[i]; + } + } else { + for (int i = 0; i < n; ++i) { + tgt[i] = src[i]; + } + } +} + +inline NVMatrix& ReducePeer::getMatrix(ReduceMessage& msg) { + if (getDeviceID() != DEVICE_HOST) { + return IReduceSegment::getMatrix(msg); + } + return _mat; +} + +/* ========================= + * EightGPUReducer + * ========================= + */ +IEightGPUReducer::IEightGPUReducer(int tgtDeviceID) : _tgtDeviceID(tgtDeviceID) { +} + +IEightGPUReducer::~IEightGPUReducer() { + vector v; + v.insert(v.end(), _sources.begin(), _sources.end()); + v.insert(v.end(), _peers.begin(), _peers.end()); + for (vector::iterator it = v.begin(); it != v.end(); ++it) { + (*it)->getQueue().enqueue(new ReduceMessage(EXIT)); + (*it)->join(); + delete *it; + } +} + +IEightGPUReducer& IEightGPUReducer::construct() { + vector same, other; + for (int i = 0; i < 8; ++i) { + if (i != _tgtDeviceID) { + if (NVMatrix::canAccessPeer(_tgtDeviceID, i)) { + same.insert(same.begin() + rand() % (1 + same.size()), i); + } else { + other.insert(other.begin() + rand() % (1 + other.size()), i); + } + } + } + assert(same.size() == 3); + assert(other.size() == 4); + makeConnections(same, other); + for (vector::const_iterator it = _sources.begin(); it != _sources.end(); ++it) { + (*it)->start(); + } + for (vector::const_iterator it = 
_peers.begin(); it != _peers.end(); ++it) { + (*it)->start(); + } + return *this; +} + +void IEightGPUReducer::reduce(std::map& mats, float scaleIntermediates, float scaleTarget) { + assert(mats.size() == 8); + // Check if source matrices are 0-sized + bool zero = true; + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != _tgtDeviceID && it->second->getNumElements() != 0) { + zero = false; + break; + } + } + if (zero) { + mats[_tgtDeviceID]->resize(*mats[(_tgtDeviceID + 1) % 8]); + } else { + for (vector::const_iterator it = _sources.begin(); it != _sources.end(); ++it) { + (*it)->getQueue().enqueue(new ReduceStartMessage(scaleIntermediates, scaleTarget, mats)); + } + _finishQueue.dequeue(); + } + assert(_finishQueue.getNumElements() == 0); +} + +void IEightGPUReducer::reduce(std::map& mats, float scaleIntermediates) { + reduce(mats, scaleIntermediates, 1); +} + +void IEightGPUReducer::reduce(std::map& mats) { + reduce(mats, 1, 1); +} + +int IEightGPUReducer::getTgtDeviceID() const { + return _tgtDeviceID; +} + +/* ========================= + * EightGPUReducer1 + * ========================= + */ +EightGPUReducer1::EightGPUReducer1(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) { +} + +void EightGPUReducer1::makeConnections(vector& same, vector&other) { + // Setup segments on same truck + _peers.push_back(new ReducePeer(*this, _tgtDeviceID, &_finishQueue)); // peers[0] = tgt + _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue)); // peers[1] = same truck 1 + _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue)); // peers[2] = same truck 2 + _sources.push_back(new ReducerSource(*this,same[2])); // sources[0] = same truck 3 + + _sources[0]->addNext(*_peers[2]); + _peers[2]->addNext(*_peers[1]); + _peers[1]->addNext(*_peers[0]); + + // Setup segments on other truck + _sources.push_back(new ReducerSource(*this,other[0])); // sources[1] = other truck 1 + _peers.push_back(new ReducePeer(*this,other[1], &_finishQueue)); // peers[3] = other truck 2 + _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue)); // peers[4] = other truck 3 + _sources.push_back(new ReducerSource(*this,other[3])); // sources[2] = other truck 4 + _peers.push_back(new ReducePeer(*this)); // peers[5] = host 1 + _peers.push_back(new ReducePeer(*this)); // peers[6] = host 2 + _peers.push_back(new ReducePeer(*this)); // peers[7] = host 3 + + _sources[1]->addNext(*_peers[3]); + _peers[3]->addNext(*_peers[5]); + _peers[5]->addNext(*_peers[7]); + _peers[7]->addNext(*_peers[0]); + _peers[4]->addNext(*_peers[6]); + _peers[6]->addNext(*_peers[7]); + _sources[2]->addNext(*_peers[4]); +} + +/* ========================= + * EightGPUReducer2 + * ========================= + */ +EightGPUReducer2::EightGPUReducer2(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) { +} + +void EightGPUReducer2::makeConnections(vector& same, vector&other) { + // Setup segments on same truck + _peers.push_back(new ReducePeer(*this,_tgtDeviceID, &_finishQueue)); // peers[0] = tgt + _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue)); // peers[1] = same truck 1 + _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue)); // peers[2] = same truck 2 + _sources.push_back(new ReducerSource(*this,same[2])); // sources[0] = same truck 3 + + _sources[0]->addNext(*_peers[2]); + _peers[2]->addNext(*_peers[1]); + _peers[1]->addNext(*_peers[0]); + + // Setup segments on other truck + _sources.push_back(new ReducerSource(*this,other[0])); // sources[1] = other truck 1 + _peers.push_back(new 
ReducePeer(*this,other[1], &_finishQueue)); // peers[3] = other truck 2 + _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue)); // peers[4] = other truck 3 + _peers.push_back(new ReducePeer(*this,other[3], &_finishQueue)); // peers[5] = other truck 4 + _peers.push_back(new ReducePeer(*this)); // peers[6] = host 1 + + _sources[1]->addNext(*_peers[3]); + _peers[3]->addNext(*_peers[4]); + _peers[4]->addNext(*_peers[5]); + _peers[5]->addNext(*_peers[6]); + _peers[6]->addNext(*_peers[0]); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu new file mode 100644 index 0000000..b8de719 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu @@ -0,0 +1,173 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/streambroadcast.cuh" + +using namespace std; + +/* + * ===================== + * StreamBroadcast + * ===================== + */ + +StreamBroadcast::StreamBroadcast(map& streams) { + _streams = streams; +} + +StreamBroadcast::StreamBroadcast() { +} + +void StreamBroadcast::toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice) { + src.copy(hostmem, _streams[srcDevice]); +} + +void StreamBroadcast::toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput) { + tgt.add(hostmem, scaleTarget, scaleOutput, tgt, _streams[tgtDevice]); +} + +void StreamBroadcast::init(map& mats) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (_streams.count(it->first) == 0) { + _ownedStreams.insert(it->first); + NVMatrix::setDeviceID(it->first); + checkCudaErrors(cudaStreamCreateWithFlags(&_streams[it->first], cudaStreamNonBlocking)); + } + } +} + +StreamBroadcast::~StreamBroadcast() { + for (set::const_iterator it = _ownedStreams.begin(); it != _ownedStreams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(_streams[*it])); + } +} + +cudaStream_t StreamBroadcast::getStream(int deviceID) { + return _streams[deviceID]; +} + +// Sync stream associated with given device id +void StreamBroadcast::sync(int deviceID) { + NVMatrix::syncStream(_streams[deviceID]); +} + +void StreamBroadcast::transfer(map& mats, int srcDevice) { + transfer(mats, _hostMem, srcDevice, 0, 1); +} + +void StreamBroadcast::transfer(map& mats, int srcDevice, float scaleTarget, float scaleOutput) { + transfer(mats, _hostMem, srcDevice, scaleTarget, scaleOutput); +} + +void StreamBroadcast::transfer(map& mats, HostNVMatrix& hostbuf, int srcDevice, float scaleTarget, float scaleOutput) { + int oldDeviceID = NVMatrix::getDeviceID(); + assert(mats.count(srcDevice) != 0); + init(mats); +// assert(_streams.count(srcDevice) != 0); + if (mats.size() > 1) { + if (mats[srcDevice]->getNumElements() == 0) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + it->second->resize(*mats[srcDevice]); + } + } else { + int tgtDevice = mats.begin()->first != 
srcDevice ? mats.begin()->first : (++mats.begin())->first; + // This case is a simple copy + if (mats.size() == 2 && NVMatrix::canAccessPeer(tgtDevice, srcDevice)) { + NVMatrix::setDeviceID(tgtDevice); + mats[tgtDevice]->add(*mats[srcDevice], scaleTarget, scaleOutput, *mats[tgtDevice], _streams[tgtDevice]); + } else { + NVMatrix& src = *mats[srcDevice]; + if (hostbuf.getNumElements() < src.getNumElements()) { + hostbuf.resize(1,src.getNumElements()); + } + hostbuf.setTrans(src.isTrans()); + + NVMatrix& hostmat = hostbuf.sliceCols(0, src.getNumElements()); + assert(hostmat.isView()); + hostmat.reshape(src.getNumRows(), src.getNumCols()); + + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + assert(it->second->isContiguous()); + NVMatrix::setDeviceID(it->first); + it->second->resize(src); + assert(it->second->isTrans() == src.isTrans()); + } + int numChunks = min(DIVUP(src.getNumElements(), SB_MIN_CHUNK_SIZE), SB_MAX_CHUNKS); + + if (numChunks == 1) { // This is a bit faster for small matrices + NVMatrix::setDeviceID(srcDevice); + toHostMem(src, hostmat, srcDevice); + NVMatrix::syncStream(_streams[srcDevice]); + + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != src.getDataDeviceID()) { + NVMatrix::setDeviceID(it->first); + toTarget(hostmat, *it->second, it->first, scaleTarget, scaleOutput); + } + } + } else { + int n = src.getNumElements(); + + map lines; + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + lines[it->first] = &it->second->reshaped(1, n); + lines[it->first]->setTrans(src.isTrans()); + } + NVMatrix& srcLine = *lines[srcDevice]; + hostmat.reshape(1, n); + + int chunkSize = DIVUP(n, numChunks); + bool trans = src.isTrans(); + for (int i = 0; i < numChunks; ++i) { + int start = i * chunkSize; + int end = min((i+1) * chunkSize, n); + if (start < end) { + NVMatrix& tmpSrc = srcLine.sliceCols(start, end); // view + NVMatrix& tmpHostmem = hostmat.sliceCols(start, end); // view + + NVMatrix::setDeviceID(srcDevice); + toHostMem(tmpSrc, tmpHostmem, srcDevice); + NVMatrix::syncStream(_streams[srcDevice]); + + for (map::const_iterator it = lines.begin(); it != lines.end(); ++it) { + if (it->first != srcDevice) { + NVMatrix& tmpTgt = it->second->sliceCols(start, end); // view + NVMatrix::setDeviceID(it->first); + toTarget(tmpHostmem, tmpTgt, it->first, scaleTarget, scaleOutput); + delete &tmpTgt; + } + } + delete &tmpSrc; + delete &tmpHostmem; + } + } + for (map::const_iterator it = lines.begin(); it != lines.end(); ++it) { + delete it->second; + } + } + delete &hostmat; + } + for(map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != srcDevice) { + NVMatrix::syncStream(_streams[it->first]); + } + } + } + } + if (oldDeviceID >= 0) { + NVMatrix::setDeviceID(oldDeviceID); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu new file mode 100644 index 0000000..13a1533 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu @@ -0,0 +1,217 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../include/util.cuh" + +using namespace std; + +stringv* getStringV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + stringv* vec = new stringv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(std::string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i)))); + } + return vec; +} + +floatv* getFloatV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + floatv* vec = new floatv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +intv* getIntV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + intv* vec = new intv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +int* getIntA(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + int* arr = new int[PyList_GET_SIZE(pyList)]; + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)); + } + return arr; +} + +MatrixV* getMatrixV(PyObject* pyList) { + return getMatrixV(pyList, PyList_GET_SIZE(pyList)); +} + +MatrixV* getMatrixV(PyObject* pyList, int len) { + if (pyList == NULL) { + return NULL; + } + MatrixV* vec = new MatrixV(); + for (int i = 0; i < len; i++) { + vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +PyObjectV* pyDictGetValues(PyObject* dict) { + PyObjectV* pov = new PyObjectV(); + PyObject* valuesList = PyDict_Values(dict); + int numValues = PyList_GET_SIZE(valuesList); + + for (int i = 0; i < numValues; i++) { + pov->push_back(PyList_GET_ITEM(valuesList, i)); + } + Py_DECREF(valuesList); + return pov; +} + +int pyDictGetInt(PyObject* dict, const char* key) { + return PyInt_AS_LONG(PyDict_GetItemString(dict, key)); +} + +intv* pyDictGetIntV(PyObject* dict, const char* key) { + return getIntV(PyDict_GetItemString(dict, key)); +} + +int* pyDictGetIntA(PyObject* dict, const char* key) { + return getIntA(PyDict_GetItemString(dict, key)); +} + +std::string pyDictGetString(PyObject* dict, const char* key) { + return std::string(PyString_AS_STRING(PyDict_GetItemString(dict, key))); +} + +float pyDictGetFloat(PyObject* dict, const char* key) { + return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key)); +} + +floatv* pyDictGetFloatV(PyObject* dict, const char* key) { + return getFloatV(PyDict_GetItemString(dict, key)); +} + +Matrix* pyDictGetMatrix(PyObject* dict, const char* key) { + return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key)); +} + +MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) { + return getMatrixV(PyDict_GetItemString(dict, key)); +} + +stringv* pyDictGetStringV(PyObject* dict, const char* key) { + return getStringV(PyDict_GetItemString(dict, key)); +} + +bool pyDictHasKey(PyObject* dict, const char* key) { + PyObject* str = PyString_FromString(key); + bool b = PyDict_Contains(dict, str); + Py_DECREF(str); + return b; +} + +template +void shuffleVector(vector& v, int start, int end) { + 
const int len = end - start; + for (int i = 0; i < len*5; ++i) { + int r1 = start + rand() % len; + int r2 = start + rand() % len; + int tmp = v[r1]; + v[r1] = v[r2]; + v[r2] = tmp; + } +} + +template +std::string tostr(T n) { + ostringstream result; + result << n; + return result.str(); +} + +template +void deleteElements(vector& v) { + deleteElements(v, false); +} + +template +void deleteElements(vector& v, bool deleteContainer) { + for (typename vector::const_iterator it = v.begin(); it != v.end(); ++it) { + delete *it; + } + if (deleteContainer) { + delete &v; + } +} + +static Lock deviceCPULock; +static std::map > deviceCPUs; + +std::vector& getDeviceCPUs(int deviceID) { + deviceCPULock.acquire(); + if (deviceCPUs.count(deviceID) == 0 && deviceID >= 0) { + struct cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, deviceID)); + char pciString[13]; + + sprintf(pciString, "%04x", props.pciDomainID); + pciString[4] = ':'; + sprintf(pciString + 5, "%02x", props.pciBusID); + pciString[7] = ':'; + sprintf(pciString + 8, "%02x", props.pciDeviceID); + pciString[10] = '.'; + pciString[11] = '0'; + pciString[12] = 0; + std::string path = std::string("/sys/bus/pci/devices/") + std::string(pciString) + "/local_cpulist"; + ifstream f(path.c_str()); + + if (f.is_open()) { + std::string cpuString; + while (getline(f, cpuString, ',')) { + int start, end; + int found = sscanf(cpuString.c_str(), "%d-%d", &start, &end); + end = found == 1 ? start : end; + if (found > 0) { + for (int i = start; i <= end; ++i) { + deviceCPUs[deviceID].push_back(i); + } + } + } + f.close(); + } else { + printf("Unable to open %s\n", path.c_str()); + } + } + vector& ret = deviceCPUs[deviceID]; + deviceCPULock.release(); + return ret; +} + +template void shuffleVector(std::vector& v, int start, int end); +template std::string tostr(int n); +template void deleteElements(std::vector& v, bool deleteContainer); diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu new file mode 100644 index 0000000..51cffa9 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu @@ -0,0 +1,460 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/weights.cuh" +#include "../include/lr.cuh" +#include "../include/worker.cuh" + +using namespace std; + +/* ======================== + * IWeightReducer + * ======================== + */ +int IWeightReducer::getDeviceID() { + return _replicas[_tgtReplicaID]->getDeviceID(); +} + +IWeightReducer::IWeightReducer(std::map& replicas, int tgtReplicaID) : _replicas(replicas), _tgtReplicaID(tgtReplicaID) { +} + +IWeightReducer::~IWeightReducer() { +} + +IWeightReducer& IWeightReducer::make(std::map& replicas, int tgtReplicaID) { + if (replicas.size() == 8) { + return *new ParallelWeightReducer(replicas, tgtReplicaID); + } + return *new SequentialWeightReducer(replicas, tgtReplicaID); +} + +/* ======================== + * SequentialWeightReducer + * ======================== + */ +SequentialWeightReducer::SequentialWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { + _sb = new StreamBroadcast(); +} + +SequentialWeightReducer::~SequentialWeightReducer() { + delete _sb; +} + +void SequentialWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { + std::map mats; // device id -> grad + mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); + for (int i = 0, r = _tgtReplicaID; i < _replicas.size(); ++i, r = (r + 1) % _replicas.size()) { + if (r != _tgtReplicaID) { + mats[_replicas[r]->getDeviceID()] = gradShards[r]; + _sb->transfer(mats, _replicas[r]->getDeviceID(), 1, gradScale); + mats.erase(_replicas[r]->getDeviceID()); + } + } +} + +/* ======================== + * ParallelWeightReducer + * ======================== + */ +ParallelWeightReducer::ParallelWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { + _reducer = &(new EightGPUReducer1(getDeviceID()))->construct(); +} + +ParallelWeightReducer::~ParallelWeightReducer() { + delete _reducer; +} + +void ParallelWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { + std::map mats; // device id -> grad + mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); + for (std::map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + if (it->first != _tgtReplicaID) { + mats[it->second->getDeviceID()] = gradShards[it->first]; + } + } + _reducer->reduce(mats, gradScale, 1); +} + +// weights has pointer to layer, layer pointer to thread +// thread has sync (copy) object for every other thread +// weights uses copy object to sum grad contributions into inc matrix slice (phase 1) +// weights broadcasts inc matrix slice to other inc matrix replicas (phase 2) + +NVMatrix& Weights::operator*() const { + return getW(); +} + +/* + * TODO: get rid of this constructor duplication. 
+ */ +Weights::Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent) { + init(srcWeights.getCPUW(), srcWeights.getCPUWInc(), lrs, parent, 0, 0, srcWeights.getMom(), srcWeights.isUseGrad(), false); + _srcWeights = &srcWeights; +} + +Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, + float wball, float mom, bool useGrad) { + init(hWeights, hWeightsInc, lrs, parent, wc, wball, mom, useGrad, true); +} + +void Weights::init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, + float wball, float mom, bool useGrad, bool cleanup) { + _srcWeights = NULL; + _hWeights = &hWeights; + _hWeightsInc = &hWeightsInc; + _numUpdates = 0; + _lrs = &lrs; + _parent = &parent; + _wc = wc; + _wball = wball; + _mom = mom; + _useGrad = useGrad; + _onGPU = false; + _weights = NULL; + _weightsInc = NULL; + _weightsGrad = NULL; + _cleanup = cleanup; + _reducer = NULL; + _broadcaster = NULL; +} + +Weights::~Weights() { + delete _lrs; + delete _reducer; + delete _broadcaster; + if (_cleanup) { + delete _hWeights; + delete _hWeightsInc; + if (_srcWeights == NULL) { + delete _weights; + delete _weightsInc; + delete _weightsGrad; + } + } +} + +NVMatrix& Weights::getW() const { + assert(_onGPU); + return *_weights; +} + +NVMatrix& Weights::getInc() const { + assert(_onGPU); + return *_weightsInc; +} + +/* + * TODO: This seems like pretty nasty behavior, I should change this. + */ +NVMatrix& Weights::getGrad() const { + assert(_onGPU); + return _useGrad ? *_weightsGrad : *_weightsInc; +} + +Matrix& Weights::getCPUW() const { + return *_hWeights; +} + +Matrix& Weights::getCPUWInc() const { + return *_hWeightsInc; +} + +int Weights::getNumRows() const { + return _hWeights->getNumRows(); +} + +int Weights::getNumCols() const { + return _hWeights->getNumCols(); +} + +map& Weights::getReplicas() { + return _replicas; +} + +template T& Weights::getShard(T& mat, int replicaID) { + const int n = mat.getNumElements(); + T& line = mat.reshaped(1, n); + const int shardStart = min(n, replicaID * _shardSize); + const int shardEnd = min(n, (replicaID + 1) * _shardSize); + T& slice = line.sliceCols(shardStart, shardEnd); + assert(slice.isView()); + delete &line; + return slice; +} + +template T& Weights::getShard(T& mat) { + return getShard(mat, getReplicaID()); +} + +ISafeBroadcastNetwork& Weights::getBroadcaster() { + if (_broadcaster == NULL) { + set devices; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + devices.insert(it->second->getDeviceID()); + } + // NOTE: we must use safe broadcaster becasue we want to *add* our value to everyone else + _broadcaster = &ISafeBroadcastNetwork::make(devices, getDeviceID()); //&(new NaiveBroadcaster(devices, getDeviceID()))->construct(); + } + return *_broadcaster; +} + +IWeightReducer& Weights::getReducer() { + if (_reducer == NULL) { + _reducer = &IWeightReducer::make(_replicas, getReplicaID()); + } + return *_reducer; +} + +void Weights::copyToCPU() { + if (_srcWeights == NULL) { + assert(_onGPU); + NVMatrix::syncStream(); // for safety + if (getReplicaID() == 0) { + _weights->copyToHost(*_hWeights); + + // Synchronize weights amongst replicas while we're at it. + map weights; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + weights[it->second->getDeviceID()] = &it->second->getW(); + } + // These things sync before returning. 
+ getBroadcaster().broadcast(weights, 1, 0); + } + if (_useGrad) { + Matrix& hIncShard = getShard(*_hWeightsInc); + _weightsInc->copyToHost(hIncShard); + delete &hIncShard; + } else { // In this case there's definitely only one replica + _weightsInc->copyToHost(*_hWeightsInc); + } + } +} + +// This function is assumed to be called in the order in which the layers +// were defined +void Weights::copyToGPU() { + assert(!_onGPU); + // Copies are performed on the default (computation) stream, so that's fine. + if (_srcWeights == NULL) { + _weights = _weights == NULL ? new NVMatrix() : _weights; + _weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc; + _weights->copyFromHost(*_hWeights, true); + + if (_useGrad) { + // In this case there is no need to store the entire inc matrix. + // Just this replica's shard (for synchronization purposes) will do. + Matrix& hIncShard = getShard(*_hWeightsInc); + _weightsInc->copyFromHost(hIncShard, true); + delete &hIncShard; + } else { + _weightsInc->copyFromHost(*_hWeightsInc, true); + } + + _weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL; + } else { + _weights = _srcWeights->_weights; + _weightsInc = _srcWeights->_weightsInc; + _weightsGrad = _srcWeights->_weightsGrad; + } + _onGPU = true; +} + +void Weights::aggregateReplicaGradients(float progress) { + map gradShards; + map wShards; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + gradShards[it->first] = &getShard(it->second->getGrad(), getReplicaID()); + wShards[it->first] = &getShard(it->second->getW(), getReplicaID()); + assert(wShards[it->first]->isContiguous() && gradShards[it->first]->isContiguous()); + } + + float gradScale = _lrs->getValue(progress); + NVMatrix::setDeviceID(getDeviceID()); + + if (_wc > 0) { + NVMatrixTernaryOps::WeightedAdd wadd = NVMatrixTernaryOps::WeightedAdd(_mom, gradScale, -_wc * _lrs->getValue(progress)); + _weightsInc->applyTernary(wadd, *gradShards[getReplicaID()], *wShards[getReplicaID()], *_weightsInc); + } else { + _weightsInc->add(*gradShards[getReplicaID()], _mom, gradScale); + } + + // Reduce everyone's gradient into my inc shard + NVMatrix::syncStream(); // Crucial since the reducer does everything in its own streams!! + getReducer().reduce(gradShards, gradScale, true); + + // Broadcast my inc -> all replicas + map mats; // device id -> grad + mats[getDeviceID()] = _weightsInc; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + if (it->first != getReplicaID()) { + mats[it->second->getDeviceID()] = wShards[it->first]; + } + } + getBroadcaster().broadcast(mats, 1, 1); + + NVMatrix::setDeviceID(getDeviceID()); + wShards[getReplicaID()]->add(*_weightsInc); + + // Cleanup + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + delete gradShards[it->first]; + delete wShards[it->first]; + } +} + + +// When _useGrad is false, weightsInc is assumed to contain the +// entire, properly scaled weight increment. +// OTHERWISE, scale your gradient by 1 / numCases only. +// The scaling by epsW will be done in this routine. 
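The comment block above states the scaling contract for `Weights::update`: the caller scales the gradient by 1 / numCases, and this routine applies the learning rate (epsW), momentum, and weight decay when forming the increment. As a reading aid only, here is a minimal NumPy sketch of the increment rule that the `WeightedAdd(_mom, gradScale, -_wc * lr)` call in `aggregateReplicaGradients` appears to implement; `eps`, `mom`, and `wc` stand in for `_lrs->getValue(progress)`, `_mom`, and `_wc`, and nothing here is part of the diff itself.

```python
import numpy as np

def sgd_step(w, inc, grad, eps=0.01, mom=0.9, wc=0.0005):
    # Sketch of the rule applied per shard in aggregateReplicaGradients:
    #   inc = mom * inc + eps * grad - eps * wc * w   (momentum + lr + L2 decay)
    #   w   = w + inc
    # 'grad' is assumed to already be scaled by 1 / numCases by the caller.
    inc = mom * inc + eps * grad - eps * wc * w
    return w + inc, inc

# Toy usage: a 4x4 weight matrix driven by a constant gradient.
w = np.zeros((4, 4), dtype=np.float32)
inc = np.zeros_like(w)
grad = np.ones_like(w)
w, inc = sgd_step(w, inc, grad)
```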
+void Weights::update(float progress) { + // Only true owner of weights updates +// printf("%s update weights\n", _parent->getName().c_str()); + if (_srcWeights == NULL && _lrs->getBaseValue() > 0) { + assert(_onGPU); + if (_useGrad) { + aggregateReplicaGradients(progress); + } else { // Definitely no replicas in this case + if (_wc > 0) { + _weightsInc->add(*_weights, -_wc * _lrs->getValue(progress)); + } + _weights->add(*_weightsInc); + } + _numUpdates = 0; + } +} + +int Weights::incNumUpdates() { + if (_srcWeights != NULL) { + return _srcWeights->incNumUpdates(); + } + return _numUpdates++; +} + +// Returns the number of times a gradient has been computed for this +// weight matrix during the current pass (interval between two calls of update()) +// through the net. This number will only be greater than 1 if this weight matrix +// is *shared* by multiple layers in the net. +int Weights::getNumUpdates() const { + if (_srcWeights != NULL) { + return _srcWeights->getNumUpdates(); + } + return _numUpdates; +} + +float Weights::getEps(float progress) const { + return _lrs->getValue(progress); +} + +float Weights::getMom() const { + return _mom; +} + +float Weights::getWC() const { + return _wc; +} + +float Weights::getWBall() const { + return _wball; +} + +bool Weights::isUseGrad() const { // is good grammar + return _useGrad; +} + +bool Weights::isOwner() const { + return _srcWeights == NULL; +} + +ParameterSchedule& Weights::getLearningRateSchedule() const { + return *_lrs; +} + +void Weights::addReplica(Weights& replica) { + _replicas[replica.getReplicaID()] = &replica; + + const int n = _hWeights->getNumElements(); + _shardSize = DIVUP(n, _replicas.size()); +} + +int Weights::getReplicaID() { + return _parent->getReplicaID(); +} + +int Weights::getDeviceID() { + return _parent->getDeviceID(); +} + +Layer& Weights::getParent() { + return *_parent; +} + +/* + * =============== + * WeightList + * =============== + */ +Weights& WeightList::operator[](const int i) const { + return *_weightList[i]; +} + +Weights& WeightList::at(const int i) const { + return *_weightList[i]; +} + +WeightList::~WeightList() { + for (int i = 0; i < _weightList.size(); i++) { + delete _weightList[i]; + } +} + +WeightList::WeightList() { +} + +void WeightList::addWeights(Weights& w) { + _weightList.push_back(&w); +} + + +void WeightList::update(float progress) { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->update(progress); + } +} + +void WeightList::copyToCPU() { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->copyToCPU(); + } +} + +void WeightList::copyToGPU() { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->copyToGPU(); + } +} + +int WeightList::getSize() const { + return _weightList.size(); +} + +void WeightList::addReplica(WeightList& replica) { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->addReplica(replica[i]); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu new file mode 100644 index 0000000..50d9b8e --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu @@ -0,0 +1,320 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/util.cuh" +#include "../include/worker.cuh" +#include "../include/timer.cuh" + +using namespace std; + +/* + * ==================== + * WorkResult + * ==================== + */ +WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) { +} + +WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) { +} + +WorkResult::~WorkResult() { + delete _results; // delete NULL is ok +} + +Cost& WorkResult::getResults() const { + return *_results; +} + +WorkResult::RESULTS WorkResult::getResultType() const { + return _resultType; +} + +/* + * ==================== + * Worker + * ==================== + */ +Worker::Worker(ConvNet& convNet) : _convNet(&convNet) { +} + +Worker::~Worker() { +} + +/* + * ==================== + * DataWorker + * ==================== + */ +DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data), _dp(NULL) { + assert(_data != NULL); +} + +bool DataWorker::run() { + _dp = &_convNet->getDataProvider(); + _dp->setData(*_data); + _run(); + _dp->clearData(); + return false; +} + +DataWorker::~DataWorker() { +} + +/* + * ==================== + * TrainingWorker + * ==================== + */ +TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test) + : DataWorker(convNet, data), _progress(progress), _test(test) { +} + +void TrainingWorker::_run() { + _convNet->setTrainingProgress(_progress); + Cost& batchCost = *new Cost(); + int numMinibatches = _dp->getNumMinibatches(); + for (int i = 0; i < numMinibatches; i++) { + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(i, p, _test ? 
PASS_TEST : PASS_TRAIN); + _convNet->getCost(batchCost); + + if (!_test) { + _convNet->bprop(p, PASS_TRAIN); + _convNet->updateWeights(p); + } + } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * SyncWorker + * ==================== + */ +SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) { +} + +bool SyncWorker::run() { + _convNet->copyToCPU(); + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE)); + return false; +} + +/* + * ==================== + * ExitWorker + * ==================== + */ +ExitWorker::ExitWorker(ConvNet& convNet) : Worker(convNet) { +} + +bool ExitWorker::run() { + return true; +} + +/* + * ==================== + * GradCheckWorker + * ==================== + */ +GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data) + : DataWorker(convNet, data) { +} + +void GradCheckWorker::_run() { + _convNet->checkGradients(); + exit(0); // eh +} + +/* + * ==================== + * MultiviewTestWorker + * ==================== + */ +MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName) + : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) { +// assert(_data->getNumCases() % _numViews == 0); +// assert(convNet.getNumReplicas() == 1); // For now? +} + +MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews) + : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") { +// assert(_data->getNumCases() % _numViews == 0); +} + +MultiviewTestWorker::~MultiviewTestWorker() { +// delete _cpuProbs; +} + +CPUData& MultiviewTestWorker::getMinibatch(int v, int i) { + int numCasesPerView = _dp->getNumCases() / _numViews; + int miniStart = v * numCasesPerView + i * _dp->getMinibatchSize(); + int miniEnd = v * numCasesPerView + min(numCasesPerView, (i + 1) * _dp->getMinibatchSize()); + CPUData& mini = _dp->getDataSlice(miniStart, miniEnd); + return mini; +} + +void MultiviewTestWorker::_run() { + int numCasesPerView = _dp->getNumCases() / _numViews; + int numMiniPerView = DIVUP(numCasesPerView, _dp->getMinibatchSize()); + + Cost& batchCost = *new Cost(); + for (int i = 0; i < numMiniPerView; i++) { + for (int v = 0; v < _numViews - 1; v++) { + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(getMinibatch(v, i), p, v == 0 ? 
PASS_MULTIVIEW_TEST_START : PASS_MULTIVIEW_TEST); + } + } + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(getMinibatch(_numViews - 1, i), p, PASS_MULTIVIEW_TEST_END); + _convNet->getCost(batchCost); + } +// if (_cpuProbs != NULL) { +// LogregCostLayer& logregLayer = *dynamic_cast(&_convNet->getLayer(_logregName, 0)); +// NVMatrix::setDeviceID(logregLayer.getDeviceID()); +// Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(), +// min(numCasesReal, (i + 1) * _dp->getMinibatchSize())); +// NVMatrix& acts = logregLayer.getProbsAccum(); +// NVMatrix acts_T; +// acts.transpose(acts_T); +// acts_T.copyToHost(miniProbs); +// +// delete &miniProbs; +// } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * FeatureWorker + * ==================== + */ +FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures) + : DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames), _deleteFeatures(deleteFeatures) { + assert(layerNames.size() == ftrs.size()); + for (int i = 0; i < layerNames.size(); i++) { + assert(ftrs[i]->getNumRows() == data.getNumCases()); + assert(!ftrs[i]->isTrans()); + } +} + +FeatureWorker::~FeatureWorker() { + if (_deleteFeatures) { + for (int i = 0; i < _ftrs->size(); i++) { + delete _ftrs->at(i); + } + delete _ftrs; + } + delete _layerNames; +} + +void FeatureWorker::_run() { + Cost& batchCost = *new Cost(); + map repStart; // Feature write start offsets within minibatch + for (int i = 0; i < _dp->getNumMinibatches(); i++) { + for (int f = 0; f < _layerNames->size(); f++) { + repStart[f] = 0; + } + + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(i, p, PASS_FEATURE_GEN); + _convNet->getCost(batchCost); + for (int f = 0; f < _layerNames->size(); f++) { + + if (_convNet->getLayer(_layerNames->at(f), 0).getFwdActiveInputReplicaIdx(p) >= 0) { + Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(), + min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); + + for (int r = 0; r < _convNet->getLayer(_layerNames->at(f), 0).getNumReplicas(); ++r) { + Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f), r); + int d = ftrLayer.getDeviceID(); + NVMatrix::setDeviceID(d); + NVMatrix& acts = ftrLayer.getActs(); + + Matrix& repMiniFtrs = miniFtrs.sliceRows(repStart[f], + min(int(miniFtrs.getNumRows()), repStart[f] + acts.getLeadingDim())); + + NVMatrix acts_T; + acts.transpose(false); + acts.transpose(acts_T); + acts_T.copyToHost(repMiniFtrs); + NVMatrix::syncStream(); // eh why not + + delete &repMiniFtrs; + + repStart[f] += acts.getLeadingDim(); + } + delete &miniFtrs; + } + } + } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * DataGradWorker + * ==================== + */ +DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx) + : DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) { +// assert(dataGrads.getNumRows() == data.getNumCases()); +// assert(!dataGrads.isTrans()); +} + +DataGradWorker::~DataGradWorker() { +// delete _dataGrads; +} + +void DataGradWorker::_run() { +// DataLayer& dataLayer = *dynamic_cast(&_convNet->getLayer(_dataLayerIdx)); +// SoftmaxLayer& softmaxLayer = *dynamic_cast(&_convNet->getLayer(_softmaxLayerIdx)); +// 
softmaxLayer.setDoLogregGrad(false); +// Cost& batchCost = *new Cost(0); +// for (int i = 0; i < _dp->getNumMinibatches(); i++) { +// _convNet->fprop(i, PASS_TEST); +// _convNet->getCost(batchCost); +// softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad()); +// +// softmaxLayer.getActsGrad().addScalar(1); +// softmaxLayer.getActsGrad().scale(-1); +// softmaxLayer.incRcvdBInputs(); +// softmaxLayer.bprop(PASS_TEST); +// +// Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(), +// min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); +// NVMatrix& grads = dataLayer.getActsGrad(); +// NVMatrix grads_T; +// if (grads.isTrans()) { +// NVMatrix& soft_T = grads.getTranspose(); +// soft_T.transpose(grads_T); +// delete &soft_T; +// } else { +// grads.transpose(grads_T); +// } +// grads_T.copyToHost(miniDataGrads); +// delete &miniDataGrads; +// +// _convNet->reset(); +// } +// cudaThreadSynchronize(); +// _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} diff --git a/caffe2/contrib/cuda-convnet2/images/show-cost.png b/caffe2/contrib/cuda-convnet2/images/show-cost.png new file mode 100644 index 0000000..1e2ad5a Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-cost.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png b/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png new file mode 100644 index 0000000..c2bc364 Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-filters.png b/caffe2/contrib/cuda-convnet2/images/show-filters.png new file mode 100644 index 0000000..ca275e9 Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-filters.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-preds.png b/caffe2/contrib/cuda-convnet2/images/show-preds.png new file mode 100644 index 0000000..0d5550f Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-preds.png differ diff --git a/caffe2/contrib/cuda-convnet2/initw.py b/caffe2/contrib/cuda-convnet2/initw.py new file mode 100644 index 0000000..8f068a3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/initw.py @@ -0,0 +1,54 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from python_util.gpumodel import * +import numpy as n +import numpy.random as nr + +def get_src(filename): + src = IGPUModel.load_checkpoint(filename) + return src['model_state']['layers'] + +# Initialize weight matrix by copying weight matrix of given layer +def makew(name, idx, shape, params): + src = get_src(params[0]) + return src[name]['weights'][idx] + +# Initialize bias vector by copying bias vector of given layer +def makeb(name, shape, params): + src = get_src(params[0]) + return src[name]['biases'] + +def concat(shape, src, src_layers, src_func): + mat = n.empty(shape, dtype=n.single, order='F') + start = 0 + for s in src_layers: + m = src_func(src[s]) + mat[:,start:start+m.shape[1]] = m + start += m.shape[1] + return mat + +# Initialize weight matrix by concatenating weight matrices of given layers +def makewcat(name, idx, shape, params): + src, src_layers = get_src(params[0]), params[1:] + return concat(shape, src, src_layers, lambda x: x['weights'][idx]) + +# Initialize bias vector by concatenating bias vectors of given layers +def makebcat(name, shape, params): + src, src_layers = get_src(params[0]), params[1:] + return concat(shape, src, src_layers, lambda x: x['biases']) + +# Initialize bias vector from tuple input +def makeb_vec(name, shape, params): + return n.array([n.single(x) for x in params], dtype=n.single).reshape((1, len(params))) diff --git a/caffe2/contrib/cuda-convnet2/layer.py b/caffe2/contrib/cuda-convnet2/layer.py new file mode 100644 index 0000000..8baef39 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layer.py @@ -0,0 +1,1537 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
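The `initw.py` helpers above initialize a layer's parameters from a saved checkpoint; `makewcat` and `makebcat` build one matrix by laying the source layers' matrices side by side in column blocks via `concat()`. Below is a self-contained NumPy sketch of that column-concatenation pattern; the layer names and shapes are made up for illustration, whereas the real functions read them from an `IGPUModel` checkpoint.

```python
import numpy as np

# Stand-in for src = get_src(checkpoint): two fake source layers whose weight
# matrices share the row count (16) and contribute 8 and 24 columns.
src = {'fc1': {'weights': [np.ones((16, 8), dtype=np.single)]},
       'fc2': {'weights': [np.full((16, 24), 2, dtype=np.single)]}}

def concat_cols(shape, src, src_layers, src_func):
    # Mirrors initw.concat: fill the target matrix column block by column block.
    mat = np.empty(shape, dtype=np.single, order='F')
    start = 0
    for s in src_layers:
        m = src_func(src[s])
        mat[:, start:start + m.shape[1]] = m
        start += m.shape[1]
    return mat

w = concat_cols((16, 32), src, ['fc1', 'fc2'], lambda x: x['weights'][0])
```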
+ +from math import exp +import sys +import ConfigParser as cfg +import os +import numpy as n +import numpy.random as nr +from math import ceil, floor +from collections import OrderedDict +from os import linesep as NL +from python_util.options import OptionsParser +import re + +class LayerParsingError(Exception): + pass + +# A neuron that doesn't take parameters +class NeuronParser: + def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): + self.type = type + self.func_str = func_str + self.uses_acts = uses_acts + self.uses_inputs = uses_inputs + + def parse(self, type): + if type == self.type: + return {'type': self.type, + 'params': {}, + 'usesActs': self.uses_acts, + 'usesInputs': self.uses_inputs} + return None + +# A neuron that takes parameters +class ParamNeuronParser(NeuronParser): + neuron_regex = re.compile(r'^\s*(\w+)\s*\[\s*(\w+(\s*,\w+)*)\s*\]\s*$') + def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): + NeuronParser.__init__(self, type, func_str, uses_acts, uses_inputs) + m = self.neuron_regex.match(type) + self.base_type = m.group(1) + self.param_names = m.group(2).split(',') + assert len(set(self.param_names)) == len(self.param_names) + + def parse(self, type): + m = re.match(r'^%s\s*\[([\d,\.\s\-]*)\]\s*$' % self.base_type, type) + if m: + try: + param_vals = [float(v.strip()) for v in m.group(1).split(',')] + if len(param_vals) == len(self.param_names): + return {'type': self.base_type, + 'params': dict(zip(self.param_names, param_vals)), + 'usesActs': self.uses_acts, + 'usesInputs': self.uses_inputs} + except TypeError: + pass + return None + +class AbsTanhNeuronParser(ParamNeuronParser): + def __init__(self): + ParamNeuronParser.__init__(self, 'abstanh[a,b]', 'f(x) = a * |tanh(b * x)|') + + def parse(self, type): + dic = ParamNeuronParser.parse(self, type) + # Make b positive, since abs(tanh(bx)) = abs(tanh(-bx)) and the C++ code + # assumes b is positive. 
+ if dic: + dic['params']['b'] = abs(dic['params']['b']) + return dic + +class ParamParser: + lrs_regex = re.compile(r'^\s*(\w+)\s*(?:\[\s*(\w+(\s*;\w+)*)\s*\])?\s*$') + param_converters = {'i': int, + 'f': float} + def __init__(self, type): + m = self.lrs_regex.match(type) + self.base_type = m.group(1) + param_names_with_type = m.group(2).split(';') if m.group(2) is not None else [] + self.param_names = [p[1:] for p in param_names_with_type] + self.param_types = [self.param_converters[p[0]] for p in param_names_with_type] + self.param_regex_inner = ";".join([('\s*%s\s*=\s*[^;,\s=]+\s*' % p) for p in self.param_names]) + self.regex_str = ('^%s\s*(?:\[(%s)\])?\s*$') % (self.base_type, self.param_regex_inner) + assert len(set(self.param_names)) == len(self.param_names) + + def parse(self, type): + m = re.match(self.regex_str, type, flags=re.IGNORECASE) + if m: + try: + param_vals = [ptype(v.split('=')[1].strip()) for ptype,v in zip(self.param_types, m.group(1).split(';'))] if m.group(1) is not None else [] + if len(param_vals) == len(self.param_names): + return {'type': self.base_type, + 'params': dict(zip(self.param_names, param_vals))} + except TypeError: + pass + return None + +# Subclass that throws more convnet-specific exceptions than the default +class MyConfigParser(cfg.SafeConfigParser): + def safe_get(self, section, option, f=cfg.SafeConfigParser.get, typestr=None, default=None): + try: + return f(self, section, option) + except cfg.NoOptionError, e: + if default is not None: + return default + raise LayerParsingError("Layer '%s': required parameter '%s' missing" % (section, option)) + except ValueError, e: + if typestr is None: + raise e + raise LayerParsingError("Layer '%s': parameter '%s' must be %s" % (section, option, typestr)) + + def safe_get_list(self, section, option, f=str, typestr='strings', default=None): + v = self.safe_get(section, option, default=default) + if type(v) == list: + return v + try: + return [f(x.strip()) for x in v.split(',')] + except: + raise LayerParsingError("Layer '%s': parameter '%s' must be ','-delimited list of %s" % (section, option, typestr)) + + def safe_get_int(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getint, typestr='int', default=default) + + def safe_get_float(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getfloat, typestr='float', default=default) + + def safe_get_bool(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getboolean, typestr='bool', default=default) + + def safe_get_float_list(self, section, option, default=None): + return self.safe_get_list(section, option, float, typestr='floats', default=default) + + def safe_get_int_list(self, section, option, default=None): + return self.safe_get_list(section, option, int, typestr='ints', default=default) + + def safe_get_bool_list(self, section, option, default=None): + return self.safe_get_list(section, option, lambda x: x.lower() in ('true', '1'), typestr='bools', default=default) + +# A class that implements part of the interface of MyConfigParser +class FakeConfigParser(object): + def __init__(self, dic): + self.dic = dic + + def safe_get(self, section, option, default=None): + if option in self.dic: + return self.dic[option] + return default + + def safe_get_int(self, section, option, default=None): + return int(self.safe_get(section, option, default)) + + def safe_get_int_list(self, section, option, default=None): + 
return list(self.safe_get(section, option, default)) + +class LayerParser: + def __init__(self): + self.dic = {} + self.set_defaults() + + # Post-processing step -- this is called after all layers have been initialized + def optimize(self, layers): + self.dic['actsTarget'] = -1 + self.dic['actsGradTarget'] = -1 + if len(set(len(l['gpu']) for l in layers.values() if 'inputs' in l and self.dic['name'] in l['inputs'])) > 1: +# print set(len(l['gpu']) for l in layers.values()) + raise LayerParsingError("Layer '%s': all next layers must have equal number of replicas." % (self.dic['name'])) + + def parse_params(self, vals, parsers, param_name, human_name, num_params=1): + dic, name = self.dic, self.dic['name'] + +# print vals + if len(vals) != num_params and len(vals) != 1: + raise LayerParsingError("Layer '%s': expected list of length %d for %s but got list of length %d."% (name, num_params, param_name, len(vals))) + parsed = [] +# print vals + for v in vals: + for p in parsers: + parsedv = p.parse(v) + if parsedv: + parsed += [parsedv] + break + if len(parsed) == 1 and num_params > 1: + parsed = parsed * num_params + if len(parsed) == num_params: + return parsed +# print parsed, vals + raise LayerParsingError("Layer '%s': unable to parse %s %s=%s." % (name, human_name, param_name, ",".join(vals))) + + # Add parameters from layer parameter file + def add_params(self, mcp): + pass +# self.dic['conserveMem'] = mcp.convnet.op.get_value('conserve_mem') if mcp.convnet is not None else 0 + + def init(self, dic): + self.dic = dic + return self + + def set_defaults(self): + self.dic['outputs'] = 0 + self.dic['parser'] = self + self.dic['requiresParams'] = False + # Does this layer use its own activity matrix + # for some purpose other than computing its output? + # Usually, this will only be true for layers that require their + # own activity matrix for gradient computations. For example, layers + # with logistic units must compute the gradient y * (1 - y), where y is + # the activity matrix. + # + # Layers that do not not use their own activity matrix should advertise + # this, since this will enable memory-saving matrix re-use optimizations. + # + # The default value of this property is True, for safety purposes. + # If a layer advertises that it does not use its own activity matrix when + # in fact it does, bad things will happen. + self.dic['usesActs'] = True + + # Does this layer use the activity matrices of its input layers + # for some purpose other than computing its output? + # + # Again true by default for safety + self.dic['usesInputs'] = True + + # Force this layer to use its own activity gradient matrix, + # instead of borrowing one from one of its inputs. + # + # This should be true for layers where the mapping from output + # gradient to input gradient is non-elementwise. + self.dic['forceOwnActs'] = True + + # Does this layer need the gradient at all? + # Should only be true for layers with parameters (weights). 
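+        # (For instance, WeightLayerParser.add_params below sets gradConsumer = True
+        # whenever any of the layer's learning-rate schedules has a positive base rate.)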
+ self.dic['gradConsumer'] = False + + # The gpu indices on which this layer runs + self.dic['gpu'] = [-1] + + def parse(self, name, mcp, prev_layers, model=None): + self.prev_layers = prev_layers + self.dic['name'] = name + self.dic['type'] = mcp.safe_get(name, 'type') + self.dic['id'] = len(prev_layers) + + return self.dic + + def verify_float_range(self, v, param_name, _min, _max): + self.verify_num_range(v, param_name, _min, _max, strconv=lambda x: '%.3f' % x) + + def verify_num_range(self, v, param_name, _min, _max, strconv=lambda x:'%d' % x): + if type(v) == list: + for i,vv in enumerate(v): + self._verify_num_range(vv, param_name, _min, _max, i, strconv=strconv) + else: + self._verify_num_range(v, param_name, _min, _max, strconv=strconv) + + def _verify_num_range(self, v, param_name, _min, _max, input=-1, strconv=lambda x:'%d' % x): + layer_name = self.dic['name'] if input < 0 else '%s[%d]' % (self.dic['name'], input) + if _min is not None and _max is not None and (v < _min or v > _max): + raise LayerParsingError("Layer '%s': parameter '%s' must be in the range %s-%s" % (layer_name, param_name, strconv(_min), strconv(_max))) + elif _min is not None and v < _min: + raise LayerParsingError("Layer '%s': parameter '%s' must be greater than or equal to %s" % (layer_name, param_name, strconv(_min))) + elif _max is not None and v > _max: + raise LayerParsingError("Layer '%s': parameter '%s' must be smaller than or equal to %s" % (layer_name, param_name, strconv(_max))) + + def verify_divisible(self, value, div, value_name, div_name=None, input_idx=0): + layer_name = self.dic['name'] if len(self.dic['inputs']) == 0 else '%s[%d]' % (self.dic['name'], input_idx) + if value % div != 0: + raise LayerParsingError("Layer '%s': parameter '%s' must be divisible by %s" % (layer_name, value_name, str(div) if div_name is None else "'%s'" % div_name)) + + def verify_str_in(self, value, param_name, lst, input_idx=-1): + lname = self.dic['name'] if input_idx == -1 else ('%s[%d]' % (self.dic['name'], input_idx)) + if value not in lst: + raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (lname, param_name, ", ".join("'%s'" % s for s in lst))) + + def verify_int_in(self, value, param_name, lst): + if value not in lst: + raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) + + def verify_all_ints_in(self, values, param_name, lst): + if len([v for v in values if v not in lst]) > 0: + raise LayerParsingError("Layer '%s': all parameters to '%s' must be among %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) + + def verify_input_dims(self, dims): + for i,d in enumerate(dims): + if d is not None and self.dic['numInputs'][i] != d: # first input must be labels + raise LayerParsingError("Layer '%s': dimensionality of input %d must be %d" % (self.dic['name'], i, d)) + + # This looks for neuron=x arguments in various layers, and creates + # separate layer definitions for them. 
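+    # For example (hypothetical layer name): a layer 'fc1' declared with neuron=relu keeps
+    # its own definition, while a new layer 'fc1_neuron' of type 'neuron' is created on the
+    # same GPUs with 'fc1' as its input, and every layer that listed 'fc1' as an input is
+    # rewired to read from 'fc1_neuron' instead (see NeuronLayerParser.detach_neuron_layer).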
+ @staticmethod + def detach_neuron_layers(layers): + for name,l in layers.items(): + if l['type'] != 'neuron' and 'neuron' in l and l['neuron']: + NeuronLayerParser().detach_neuron_layer(name, layers) + + @staticmethod + def parse_layers(layer_cfg_path, param_cfg_path, model, layers={}): + try: + if not os.path.exists(layer_cfg_path): + raise LayerParsingError("Layer definition file '%s' does not exist" % layer_cfg_path) + if not os.path.exists(param_cfg_path): + raise LayerParsingError("Layer parameter file '%s' does not exist" % param_cfg_path) + if len(layers) == 0: + mcp = MyConfigParser(dict_type=OrderedDict) + mcp.readfp(open(layer_cfg_path)) + for name in mcp.sections(): + if not mcp.has_option(name, 'type'): + raise LayerParsingError("Layer '%s': no type given" % name) + ltype = mcp.safe_get(name, 'type') + if ltype not in layer_parsers: + raise LayerParsingError("Layer '%s': Unknown layer type: '%s'" % (name, ltype)) + layers[name] = layer_parsers[ltype]().parse(name, mcp, layers, model) + + LayerParser.detach_neuron_layers(layers) + for l in layers.values(): + l['parser'].optimize(layers) + del l['parser'] + + for name,l in layers.items(): + if not l['type'].startswith('cost.'): + found = max(name in l2['inputs'] for l2 in layers.values() if 'inputs' in l2) + if not found: + raise LayerParsingError("Layer '%s' of type '%s' is unused" % (name, l['type'])) + + mcp = MyConfigParser(dict_type=OrderedDict) + mcp.readfp(open(param_cfg_path)) +# mcp.convnet = model + for name,l in layers.items(): + if not mcp.has_section(name) and l['requiresParams']: + raise LayerParsingError("Layer '%s' of type '%s' requires extra parameters, but none given in file '%s'." % (name, l['type'], param_cfg_path)) + lp = layer_parsers[l['type']]().init(l) + lp.add_params(mcp) + except LayerParsingError, e: + print e + sys.exit(1) + return layers + + @staticmethod + def register_layer_parser(ltype, cls): + if ltype in layer_parsers: + raise LayerParsingError("Layer type '%s' already registered" % ltype) + layer_parsers[ltype] = cls + +# Any layer that takes an input (i.e. non-data layer) +class LayerWithInputParser(LayerParser): + def __init__(self, num_inputs=-1): + LayerParser.__init__(self) + self.num_inputs = num_inputs + + def verify_num_params(self, params, auto_expand=True): + for param in params: + if len(self.dic[param]) != len(self.dic['inputs']): + if auto_expand and len(self.dic[param]) == 1: + self.dic[param] *= len(self.dic['inputs']) + else: + raise LayerParsingError("Layer '%s': %s list length does not match number of inputs" % (self.dic['name'], param)) + + # layers: dictionary: name -> layer + def optimize(self, layers): + LayerParser.optimize(self, layers) + dic = self.dic + + # Check if I have an input that no one else uses. + #print "Layer %s optimizing" % dic['name'] + if not dic['forceOwnActs']: + for i, inp in enumerate(dic['inputLayers']): + if inp['outputs'] == dic['outputs'] and sum(('inputs' in ll) and (inp['name'] in ll['inputs']) for ll in layers.itervalues()) == 1: + # I can share my activity matrix with this layer + # if it does not use its activity matrix, and I + # do not need to remember my inputs. + # TODO: a dropout layer should always be able to overwrite + # its input. Make it so. 
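+                    # (The elementwise-sum and dropout parsers below set usesActs and
+                    # usesInputs to False precisely so that this sharing can kick in.)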
+# print "Layer %s(uses inputs=%d), input %s(uses acts = %d)" % (dic['name'], dic['usesInputs'], inp['name'], inp['usesActs']) + if not inp['usesActs'] and not dic['usesInputs']: + dic['actsTarget'] = i + print "Layer %s using acts from layer %s" % (dic['name'], inp['name']) +# print "Layer '%s' sharing activity matrix with layer '%s'" % (dic['name'], l['name']) + # I can share my gradient matrix with this layer if we're on the same GPU. + # This is different from the logic for actsTarget because this guy doesn't + # have an actsGrad matrix on my GPU if our GPUs are different, so there's + # nothing to share. + if dic['gpu'] == inp['gpu']: + dic['actsGradTarget'] = i +# print "Layer '%s' sharing activity gradient matrix with layer '%s'" % (dic['name'], l['name']) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerParser.parse(self, name, mcp, prev_layers, model) + + dic['inputs'] = [inp.strip() for inp in mcp.safe_get(name, 'inputs').split(',')] + + for inp in dic['inputs']: + if inp not in prev_layers: + raise LayerParsingError("Layer '%s': input layer '%s' not defined" % (name, inp)) + + dic['inputLayers'] = [prev_layers[inp] for inp in dic['inputs']] + dic['gpu'] = mcp.safe_get_int_list(name, 'gpu', default=dic['inputLayers'][0]['gpu']) + dic['gpus'] = ", ".join('%s' % d for d in dic['gpu']) + dic['numReplicas'] = len(dic['gpu']) + + if len(set(dic['gpu'])) != len(dic['gpu']): + raise LayerParsingError("Layer '%s': all replicas must run on different GPUs." % (name)) + + for inp in dic['inputs']: + # Data layers do not explicitly define how many replicas they have. + # The number of replicas for a data layer is given by the number of replicas + # in the next layer(s). So we set that here. + inpl = prev_layers[inp] + if inpl['type'] == 'data': + inpl['numReplicas'] = dic['numReplicas'] + if inpl['numReplicas'] % dic['numReplicas'] != 0: + raise LayerParsingError("Layer '%s': number of replicas (%d) must divide number of replicas in all input layers (input %s has %d replicas)." % (name, dic['numReplicas'], inpl['name'], inpl['numReplicas'])) + if len(set(inp['numReplicas'] for inp in dic['inputLayers'])) != 1: + raise LayerParsingError("Layer '%s': all input layers must have equal numbers of replicas." % (name)) + + # Need to also assert that all *next* layers have equal number of replicas but this is hard so it's done in Layer.optimize + for inp in dic['inputLayers']: + if inp['outputs'] == 0: + raise LayerParsingError("Layer '%s': input layer '%s' does not produce any output" % (name, inp['name'])) + dic['numInputs'] = [inp['outputs'] for inp in dic['inputLayers']] + + # Layers can declare a neuron activation function to apply to their output, as a shortcut + # to avoid declaring a separate neuron layer above themselves. 
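+        # An illustrative (hypothetical) layer-definition entry using this shortcut:
+        #   [fc1]
+        #   type=fc
+        #   inputs=pool1
+        #   outputs=1024
+        #   neuron=relu
+        # which detach_neuron_layers() later splits into 'fc1' plus an 'fc1_neuron' layer.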
+ dic['neuron'] = mcp.safe_get(name, 'neuron', default="") + if self.num_inputs > 0 and len(dic['numInputs']) != self.num_inputs: + raise LayerParsingError("Layer '%s': number of inputs must be %d" % (name, self.num_inputs)) + + if model: + self.verify_all_ints_in(dic['gpu'], 'gpu', range(len(model.op.get_value('gpu')))) + return dic + + def verify_img_size(self): + dic = self.dic + if dic['numInputs'][0] % dic['imgPixels'] != 0 or dic['imgSize'] * dic['imgSize'] != dic['imgPixels']: + raise LayerParsingError("Layer '%s': has %-d dimensional input, not interpretable as %d-channel images" % (dic['name'], dic['numInputs'][0], dic['channels'])) + + @staticmethod + def grad_consumers_below(dic): + if dic['gradConsumer']: + return True + if 'inputLayers' in dic: + return any(LayerWithInputParser.grad_consumers_below(l) for l in dic['inputLayers']) + + def verify_no_grads(self): + if LayerWithInputParser.grad_consumers_below(self.dic): + raise LayerParsingError("Layer '%s': layers of type '%s' cannot propagate gradient and must not be placed over layers with parameters." % (self.dic['name'], self.dic['type'])) + +class NailbedLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['stride'] = mcp.safe_get_int(name, 'stride') + + self.verify_num_range(dic['channels'], 'channels', 1, None) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['outputsX'] = (dic['imgSize'] + dic['stride'] - 1) / dic['stride'] + dic['start'] = (dic['imgSize'] - dic['stride'] * (dic['outputsX'] - 1)) / 2 + dic['outputs'] = dic['channels'] * dic['outputsX']**2 + + self.verify_num_range(dic['outputsX'], 'outputsX', 0, None) + + self.verify_img_size() + + print "Initialized bed-of-nails layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels']) + return dic + +class GaussianBlurLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + dic['outputs'] = dic['numInputs'][0] + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['filterSize'] = mcp.safe_get_int(name, 'filterSize') + dic['stdev'] = mcp.safe_get_float(name, 'stdev') + + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_int_in(dic['filterSize'], 'filterSize', [3, 5, 7, 9]) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['filter'] = n.array([exp(-(dic['filterSize']/2 - i)**2 / float(2 * dic['stdev']**2)) + for i in xrange(dic['filterSize'])], dtype=n.float32).reshape(1, dic['filterSize']) + dic['filter'] /= dic['filter'].sum() + self.verify_img_size() + + if dic['filterSize'] > dic['imgSize']: + raise LayerParsingError("Later '%s': filter size (%d) must be smaller than image size (%d)." 
% (dic['name'], dic['filterSize'], dic['imgSize'])) + + print "Initialized Gaussian blur layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + + return dic + +class HorizontalReflectionLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['numInputs'][0] + dic['channels'] = mcp.safe_get_int(name, 'channels') + + self.verify_num_range(dic['channels'], 'channels', 1, 3) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + self.verify_img_size() + + print "Initialized horizontal reflection layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + + return dic + +class ResizeLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['scale'] = mcp.safe_get_float(name, 'scale') + dic['tgtSize'] = int(floor(dic['imgSize'] / dic['scale'])) + dic['tgtPixels'] = dic['tgtSize']**2 + self.verify_num_range(dic['channels'], 'channels', 1, None) + # Really not recommended to use this for such severe scalings + self.verify_float_range(dic['scale'], 'scale', 0.5, 2) + + dic['outputs'] = dic['channels'] * dic['tgtPixels'] + + self.verify_img_size() + self.verify_no_grads() + + print "Initialized resize layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels']) + + return dic + +class RandomScaleLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + self.verify_num_range(dic['channels'], 'channels', 1, None) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['maxScale'] = mcp.safe_get_float(name, 'maxScale') + dic['tgtSize'] = mcp.safe_get_int(name, 'tgtSize') + min_size = int(floor(dic['imgSize'] / dic['maxScale'])) + max_size = dic['imgSize'] #int(floor(dic['imgSize'] * dic['maxScale'])) + if dic['tgtSize'] < min_size: + raise LayerParsingError("Layer '%s': target size must be greater than minimum image size after rescaling (%d)" % (name, min_size)) + if dic['tgtSize'] > max_size: + raise LayerParsingError("Layer '%s': target size must be smaller than maximum image size after rescaling (%d)" % (name, max_size)) + dic['tgtPixels'] = dic['tgtSize']**2 + + self.verify_float_range(dic['maxScale'], 'maxScale', 1, 2) + + dic['outputs'] = dic['channels'] * dic['tgtPixels'] + + self.verify_img_size() + self.verify_no_grads() + + print "Initialized random scale layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], 
dic['channels']) + + return dic + +class CropLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + self.verify_num_range(dic['channels'], 'channels', 1, None) + dic['startX'] = mcp.safe_get_int(name, 'startX') + dic['startY'] = mcp.safe_get_int(name, 'startY', default=dic['startX']) + dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['outputs'] = dic['channels'] * (dic['sizeX']**2) + + self.verify_num_range(dic['startX'], 'startX', 0, dic['imgSize']-1) + self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) + self.verify_num_range(dic['startY'], 'startY', 0, dic['imgSize']-1) + self.verify_img_size() + self.verify_no_grads() + + if dic['startX'] + dic['sizeX'] > dic['imgSize']: + raise LayerParsingError("Layer '%s': startX (%d) + sizeX (%d) > imgSize (%d)" % (name, dic['startX'], dic['sizeX'], dic['imgSize'])) + + print "Initialized cropping layer '%s', producing %dx%d %d-channel output" % (name, dic['sizeX'], dic['sizeX'], dic['channels']) + + return dic + +class ColorTransformLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / 3 + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['channels'] = 3 + dic['outputs'] = dic['numInputs'][0] + + self.verify_img_size() + self.verify_no_grads() + + return dic + +class RGBToYUVLayerParser(ColorTransformLayerParser): + def __init__(self): + ColorTransformLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model=None): + dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) + print "Initialized RGB --> YUV layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + return dic + +class RGBToLABLayerParser(ColorTransformLayerParser): + def __init__(self): + ColorTransformLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model=None): + dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) + dic['center'] = mcp.safe_get_bool(name, 'center', default=False) + print "Initialized RGB --> LAB layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + return dic + +class NeuronLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + @staticmethod + def get_unused_layer_name(layers, wish): + if wish not in layers: + return wish + for i in xrange(1, 100): + name = '%s.%d' % (wish, i) + if name not in layers: + return name + raise LayerParsingError("This is insane.") + + def parse_neuron(self, neuron_str): + for n in neuron_parsers: + p = n.parse(neuron_str) + if p: # Successfully parsed neuron, return it + self.dic['neuron'] = p + self.dic['usesActs'] = self.dic['neuron']['usesActs'] + self.dic['usesInputs'] = 
self.dic['neuron']['usesInputs'] + + return + # Could not parse neuron + # Print available neuron types + colnames = ['Neuron type', 'Function'] + m = max(len(colnames[0]), OptionsParser._longest_value(neuron_parsers, key=lambda x:x.type)) + 2 + ntypes = [OptionsParser._bold(colnames[0].ljust(m))] + [n.type.ljust(m) for n in neuron_parsers] + fnames = [OptionsParser._bold(colnames[1])] + [n.func_str for n in neuron_parsers] + usage_lines = NL.join(ntype + fname for ntype,fname in zip(ntypes, fnames)) + + raise LayerParsingError("Layer '%s': unable to parse neuron type '%s'. Valid neuron types: %sWhere neurons have parameters, they must be floats." % (self.dic['name'], neuron_str, NL + usage_lines + NL)) + + def detach_neuron_layer(self, src_name, layers): + dic = self.dic +# self.set_defaults() + dic['name'] = NeuronLayerParser.get_unused_layer_name(layers, '%s_neuron' % src_name) + dic['type'] = 'neuron' + dic['inputs'] = src_name + dic['neuron'] = layers[src_name]['neuron'] + dic['gpu'] = layers[src_name]['gpu'] + + # Yes it's not entirely correct to pass all of layers as prev_layers, but it's harmless + dic = self.parse(dic['name'], FakeConfigParser(dic), layers) + dic['src_layer'] = src_name + + # Link upper layers to this new one + for l in layers.values(): + if 'inputs' in l: + l['inputs'] = [inp if inp != src_name else dic['name'] for inp in l['inputs']] + l['inputLayers'] = [inp if inp['name'] != src_name else dic for inp in l['inputLayers']] + layers[dic['name']] = dic + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['numInputs'][0] + self.parse_neuron(dic['neuron']) + dic['forceOwnActs'] = False + print "Initialized neuron layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class EltwiseSumLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['coeffs'] = mcp.safe_get_float_list(name, 'coeffs', default=[1.0] * len(dic['inputs'])) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + + if len(set(dic['numInputs'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) + dic['outputs'] = dic['numInputs'][0] + dic['usesInputs'] = False + dic['usesActs'] = False + dic['forceOwnActs'] = False + dic['requiresParams'] = True + + print "Initialized elementwise sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class EltwiseMaxLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + if len(dic['inputs']) < 2: + raise LayerParsingError("Layer '%s': elementwise max layer must have at least 2 inputs, got %d." % (name, len(dic['inputs']))) + if len(set(dic['numInputs'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. 
Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) + dic['outputs'] = dic['numInputs'][0] + + print "Initialized elementwise max layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class SumLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + + dic['stride'] = mcp.safe_get_int(name, 'stride', default=1) + self.verify_divisible(dic['numInputs'][0], dic['stride'], 'input dimensionality', 'stride') + dic['outputs'] = dic['numInputs'][0] / dic['stride'] + + print "Initialized sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class DropoutLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['enable'] = mcp.safe_get_bool(name, 'enable', default=True) + dic['keep'] = mcp.safe_get_float(name, 'keep', default=0.5) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['usesInputs'] = False + dic['usesActs'] = False + dic['forceOwnActs'] = False + dic['outputs'] = dic['numInputs'][0] + + print "Initialized %s layer '%s' on GPUs %s, producing %d outputs" % (dic['type'], name, dic['gpus'], dic['outputs']) + return dic + +class Dropout2LayerParser(DropoutLayerParser): + def __init__(self): + DropoutLayerParser.__init__(self) + +class WeightLayerParser(LayerWithInputParser): + LAYER_PAT = re.compile(r'^\s*([^\s\[]+)(?:\[(\d+)\])?\s*$') # matches things like layername[5], etc + + def __init__(self, num_inputs=-1): + LayerWithInputParser.__init__(self, num_inputs=num_inputs) + + @staticmethod + def get_layer_name(name_str): + m = WeightLayerParser.LAYER_PAT.match(name_str) + if not m: + return None + return m.group(1), m.group(2) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['momW'] = mcp.safe_get_float_list(name, 'momW') + dic['momB'] = mcp.safe_get_float(name, 'momB') + dic['superEps'] = mcp.safe_get_float(name, 'superEps', default=0.0) + dic['superMom'] = mcp.safe_get_float(name, 'superMom', default=0.0) + dic['wc'] = mcp.safe_get_float_list(name, 'wc', default=[0.0] * len(dic['inputs'])) + dic['wball'] = mcp.safe_get_float_list(name, 'wball', default=[0.0] * len(dic['inputs'])) + self.verify_num_params(['momW', 'wc', 'wball']) +# dic['wballNormed'] = [wball * nweights for wball,nweights in zip(dic['wball'], dic['weightsPerFilter'])] + dic['wballNormed'] = dic['wball'] + + # Convert from old-style 0.001,0.02 hyperparam specification to new-stye + # const[base=0.001],const[base=0.02] and so forth + def convert_scalars_to_schedules(scalars): + parts = scalars.split(',') + for i,p in enumerate(parts): + p = p.strip() + if re.match('(?:\d*\.)?\d+$', p): + parts[i] = 'const[base=%s]' % p + return parts + + dic['epsW'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsW')), lrs_parsers, 'epsW', 'learning rate schedule', num_params=len(dic['inputs'])) + dic['epsB'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsB')), lrs_parsers, 'epsB', 'learning rate schedule', num_params=1)[0] + + 
dic['updatePeriod'] = mcp.safe_get_int(name, 'updatePeriod', default=0) # 0 means update as often as possible + # TODO: assert that updatePeriod is a multiple of active pass period, which is unknown here. + # the assert has to go in some post-processing step.. + dic['gradConsumer'] = dic['epsB']['params']['base'] > 0 or any(w['params']['base'] > 0 for w in dic['epsW']) + + @staticmethod + def unshare_weights(layer, layers, matrix_idx=None): + def unshare(layer, layers, indices): + for i in indices: + if layer['weightSourceLayers'][i] >= 0: + src_matrix_idx = layer['weightSourceMatrixIndices'][i] + layer['weightSourceLayers'][i] = "" + layer['weightSourceMatrixIndices'][i] = -1 + layer['weights'][i] = layer['weights'][i].copy() + layer['weightsInc'][i] = n.zeros_like(layer['weights'][i]) + print "Unshared weight matrix %s[%d] from %s[%d]." % (layer['name'], i, layer['weightSourceLayers'][i], src_matrix_idx) + else: + print "Weight matrix %s[%d] already unshared." % (layer['name'], i) + if 'weightSourceLayers' in layer: + unshare(layer, layers, range(len(layer['inputs'])) if matrix_idx is None else [matrix_idx]) + + # Load weight/biases initialization module + def call_init_func(self, param_name, shapes, input_idx=-1): + dic = self.dic + func_pat = re.compile('^([^\.]+)\.([^\(\)]+)\s*(?:\(([^,]+(?:,[^,]+)*)\))?$') + m = func_pat.match(dic[param_name]) + if not m: + raise LayerParsingError("Layer '%s': '%s' parameter must have format 'moduleName.functionName(param1,param2,...)'; got: %s." % (dic['name'], param_name, dic['initWFunc'])) + module, func = m.group(1), m.group(2) + params = m.group(3).split(',') if m.group(3) is not None else [] + try: + mod = __import__(module) + return getattr(mod, func)(dic['name'], input_idx, shapes, params=params) if input_idx >= 0 else getattr(mod, func)(dic['name'], shapes, params=params) + except (ImportError, AttributeError, TypeError), e: + raise LayerParsingError("Layer '%s': %s." % (dic['name'], e)) + + def make_weights(self, initW, rows, cols, order='C'): + dic = self.dic + dic['weights'], dic['weightsInc'] = [], [] + if dic['initWFunc']: # Initialize weights from user-supplied python function + # Initialization function is supplied in the format + # module.func + for i in xrange(len(dic['inputs'])): + dic['weights'] += [self.call_init_func('initWFunc', (rows[i], cols[i]), input_idx=i)] + + if type(dic['weights'][i]) != n.ndarray: + raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], i, dic['initWFunc'], type(dic['weights'][i]))) + if dic['weights'][i].dtype != n.float32: + raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must weight matrices consisting of single-precision floats. Got: %s." % (dic['name'], i, dic['initWFunc'], dic['weights'][i].dtype)) + if dic['weights'][i].shape != (rows[i], cols[i]): + raise LayerParsingError("Layer '%s[%d]': weight matrix returned by weight initialization function %s has wrong shape. Should be: %s; got: %s." 
% (dic['name'], i, dic['initWFunc'], (rows[i], cols[i]), dic['weights'][i].shape)) + # Convert to desired order + dic['weights'][i] = n.require(dic['weights'][i], requirements=order) + dic['weightsInc'] += [n.zeros_like(dic['weights'][i])] + print "Layer '%s[%d]' initialized weight matrices from function %s" % (dic['name'], i, dic['initWFunc']) + else: + for i in xrange(len(dic['inputs'])): + if dic['weightSourceLayers'][i] != '': # Shared weight matrix + src_layer = self.prev_layers[dic['weightSourceLayers'][i]] if dic['weightSourceLayers'][i] != dic['name'] else dic + dic['weights'] += [src_layer['weights'][dic['weightSourceMatrixIndices'][i]]] + dic['weightsInc'] += [src_layer['weightsInc'][dic['weightSourceMatrixIndices'][i]]] + if dic['weights'][i].shape != (rows[i], cols[i]): + raise LayerParsingError("Layer '%s': weight sharing source matrix '%s' has shape %dx%d; should be %dx%d." + % (dic['name'], dic['weightSource'][i], dic['weights'][i].shape[0], dic['weights'][i].shape[1], rows[i], cols[i])) + print "Layer '%s' initialized weight matrix %d from %s" % (dic['name'], i, dic['weightSource'][i]) + else: + dic['weights'] += [n.array(initW[i] * nr.randn(rows[i], cols[i]), dtype=n.single, order=order)] + dic['weightsInc'] += [n.zeros_like(dic['weights'][i])] + + def make_biases(self, rows, cols, order='C'): + dic = self.dic + if dic['initBFunc']: + dic['biases'] = self.call_init_func('initBFunc', (rows, cols)) + if type(dic['biases']) != n.ndarray: + raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], dic['initBFunc'], type(dic['biases']))) + if dic['biases'].dtype != n.float32: + raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object consisting of single-precision floats. Got: %s." % (dic['name'], dic['initBFunc'], dic['biases'].dtype)) + if dic['biases'].shape != (rows, cols): + raise LayerParsingError("Layer '%s': bias vector returned by bias initialization function %s has wrong shape. Should be: %s; got: %s." % (dic['name'], dic['initBFunc'], (rows, cols), dic['biases'].shape)) + + dic['biases'] = n.require(dic['biases'], requirements=order) + print "Layer '%s' initialized bias vector from function %s" % (dic['name'], dic['initBFunc']) + else: + dic['biases'] = dic['initB'] * n.ones((rows, cols), order=order, dtype=n.single) + dic['biasesInc'] = n.zeros_like(dic['biases']) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['gradConsumer'] = True + dic['usesActs'] = False + dic['initW'] = mcp.safe_get_float_list(name, 'initW', default=0.01) + dic['initB'] = mcp.safe_get_float(name, 'initB', default=0) + dic['initWFunc'] = mcp.safe_get(name, 'initWFunc', default="") + dic['initBFunc'] = mcp.safe_get(name, 'initBFunc', default="") + # Find shared weight matrices + + dic['weightSource'] = mcp.safe_get_list(name, 'weightSource', default=[''] * len(dic['inputs'])) + self.verify_num_params(['initW']) + self.verify_num_params(['weightSource'], auto_expand=False) + + dic['weightSourceLayers'] = [] + dic['weightSourceMatrixIndices'] = [] + + for i, src_name in enumerate(dic['weightSource']): + src_layer_matrix_idx = -1 + src_layer_name = '' + if src_name != '': + src_layer_match = WeightLayerParser.get_layer_name(src_name) + if src_layer_match is None: + raise LayerParsingError("Layer '%s': unable to parse weight sharing source '%s'. 
Format is layer[idx] or just layer, in which case idx=0 is used." % (name, src_name)) + src_layer_name = src_layer_match[0] + src_layer_matrix_idx = int(src_layer_match[1]) if src_layer_match[1] is not None else 0 + + if src_layer_name not in prev_layers and src_layer_name != name: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' does not exist." % (name, src_layer_name)) + +# src_layer_idx = prev_names.index(src_layer_name) if src_layer_name != name else len(prev_names) + src_layer = prev_layers[src_layer_name] if src_layer_name != name else dic + if src_layer['gpu'] != dic['gpu']: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' runs on GPUs %s, while '%s' runs on GPUs %s." % (name, src_layer_name, src_layer['gpu'], name, dic['gpu'])) + if src_layer['type'] != dic['type']: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' is of type '%s'; should be '%s'." % (name, src_layer_name, src_layer['type'], dic['type'])) + if src_layer_name != name and len(src_layer['weights']) <= src_layer_matrix_idx: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' has %d weight matrices, but '%s[%d]' requested." % (name, src_layer_name, len(src_layer['weights']), src_name, src_layer_matrix_idx)) + if src_layer_name == name and src_layer_matrix_idx >= i: + raise LayerParsingError("Layer '%s': weight sharing source '%s[%d]' not defined yet." % (name, name, src_layer_matrix_idx)) + + dic['weightSourceLayers'] += [src_layer_name] + dic['weightSourceMatrixIndices'] += [src_layer_matrix_idx] + + return dic + +class FCLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + + dic['outputs'] = mcp.safe_get_int(name, 'outputs') + dic['weightsPerFilter'] = dic['numInputs'] + self.verify_num_range(dic['outputs'], 'outputs', 1, None) + self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']] * len(dic['numInputs']), order='F') + self.make_biases(1, dic['outputs'], order='F') + + print "Initialized fully-connected layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class SplitFCLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + dic['parts'] = mcp.safe_get_int(name, 'parts') + dic['outputs'] = mcp.safe_get_int(name, 'outputs') * dic['parts'] + dic['weightsPerFilter'] = dic['numInputs'] + self.verify_num_range(dic['parts'], 'parts', 1, None) + + self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']/dic['parts']] * len(dic['numInputs']), order='F') + self.make_biases(1, dic['outputs'], order='F') + + for i in xrange(len(dic['numInputs'])): + self.verify_divisible(dic['numInputs'][i], dic['parts'], 'numInputs', 'parts', input_idx=i) + + print "Initialized split fully-connected layer '%s' on GPUs %s, producing %d outputs in %d parts" % (name, dic['gpus'], dic['outputs'], dic['parts']) + return dic + +class LocalLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + # Convert convolutional layer to unshared, locally-connected layer + @staticmethod + def conv_to_local(layers, lname): + layer = layers[lname] + if layer['type'] == 'conv': + layer['type'] = 'local' + for inp,inpname in enumerate(layer['inputs']): + 
src_layer_name = layer['weightSourceLayers'][inp] + if src_layer_name != '': + src_layer = layers[src_layer_name] + src_matrix_idx = layer['weightSourceMatrixIndices'][inp] + LocalLayerParser.conv_to_local(layers, src_layer_name) + for w in ('weights', 'weightsInc'): + layer[w][inp] = src_layer[w][src_matrix_idx] + else: + layer['weights'][inp] = n.require(n.reshape(n.tile(n.reshape(layer['weights'][inp], (1, n.prod(layer['weights'][inp].shape))), (layer['modules'], 1)), + (layer['modules'] * layer['filterChannels'][inp] * layer['filterPixels'][inp], layer['filters'])), + requirements='C') + layer['weightsInc'][inp] = n.zeros_like(layer['weights'][inp]) + if layer['sharedBiases']: + layer['biases'] = n.require(n.repeat(layer['biases'], layer['modules'], axis=0), requirements='C') + layer['biasesInc'] = n.zeros_like(layer['biases']) + + print "Converted layer '%s' from convolutional to unshared, locally-connected" % layer['name'] + + # Also call this function on any layers sharing my weights + for l in layers: + if 'weightSourceLayers' in l and lname in l['weightSourceLayers']: + LocalLayerParser.conv_to_local(layers, l) + return layer + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['usesActs'] = False + # Supplied values + dic['channels'] = mcp.safe_get_int_list(name, 'channels') + dic['padding'] = mcp.safe_get_int_list(name, 'padding', default=[0]*len(dic['inputs'])) + dic['stride'] = mcp.safe_get_int_list(name, 'stride', default=[1]*len(dic['inputs'])) + dic['filterSize'] = mcp.safe_get_int_list(name, 'filterSize') + dic['filters'] = mcp.safe_get_int_list(name, 'filters') + dic['groups'] = mcp.safe_get_int_list(name, 'groups', default=[1]*len(dic['inputs'])) + dic['initW'] = mcp.safe_get_float_list(name, 'initW') + dic['initCFunc'] = mcp.safe_get(name, 'initCFunc', default='') + dic['modulesX'] = mcp.safe_get_int(name, 'modulesX', default=0) + + + self.verify_num_params(['channels', 'padding', 'stride', 'filterSize', \ + 'filters', 'groups', 'initW']) + + self.verify_num_range(dic['stride'], 'stride', 1, None) + self.verify_num_range(dic['filterSize'],'filterSize', 1, None) + self.verify_num_range(dic['padding'], 'padding', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_num_range(dic['groups'], 'groups', 1, None) + self.verify_num_range(dic['modulesX'], 'modulesX', 0, None) + for i in xrange(len(dic['filters'])): + self.verify_divisible(dic['filters'][i], 16, 'filters', input_idx=i) + + # Computed values + dic['imgPixels'] = [numInputs/channels for numInputs,channels in zip(dic['numInputs'], dic['channels'])] + dic['imgSize'] = [int(n.sqrt(imgPixels)) for imgPixels in dic['imgPixels']] + self.verify_num_range(dic['imgSize'], 'imgSize', 1, None) + dic['filters'] = [filters*groups for filters,groups in zip(dic['filters'], dic['groups'])] + dic['filterPixels'] = [filterSize**2 for filterSize in dic['filterSize']] + if dic['modulesX'] <= 0: + dic['modulesX'] = [1 + int(ceil((2*padding + imgSize - filterSize) / float(stride))) for padding,imgSize,filterSize,stride in zip(dic['padding'], dic['imgSize'], dic['filterSize'], dic['stride'])] + else: + dic['modulesX'] = [dic['modulesX']] * len(dic['inputs']) + + dic['filterChannels'] = [channels/groups for channels,groups in zip(dic['channels'], dic['groups'])] + + if len(set(dic['modulesX'])) != 1 or len(set(dic['filters'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must produce 
equally-dimensioned output. Dimensions are: %s." % (name, ", ".join("%dx%dx%d" % (filters, modulesX, modulesX) for filters,modulesX in zip(dic['filters'], dic['modulesX']))))
+
+        dic['modulesX'] = dic['modulesX'][0]
+        dic['modules'] = dic['modulesX']**2
+        dic['filters'] = dic['filters'][0]
+        dic['outputs'] = dic['modules'] * dic['filters']
+#        dic['filterConns'] = [[]] * len(dic['inputs'])
+        for i in xrange(len(dic['inputs'])):
+            if dic['numInputs'][i] % dic['imgPixels'][i] != 0 or dic['imgSize'][i] * dic['imgSize'][i] != dic['imgPixels'][i]:
+                raise LayerParsingError("Layer '%s[%d]': has %-d dimensional input, not interpretable as square %d-channel images" % (name, i, dic['numInputs'][i], dic['channels'][i]))
+            if dic['channels'][i] > 3 and dic['channels'][i] % 4 != 0:
+                raise LayerParsingError("Layer '%s[%d]': number of channels must be smaller than 4 or divisible by 4" % (name, i))
+#            if dic['filterSize'][i] > totalPadding[i] + dic['imgSize'][i]:
+#                raise LayerParsingError("Layer '%s[%d]': filter size (%d) greater than image size + padding (%d)" % (name, i, dic['filterSize'][i], dic['padding'][i] + dic['imgSize'][i]))
+            if -dic['padding'][i] + dic['stride'][i] * (dic['modulesX'] - 1) + dic['filterSize'][i] < dic['imgSize'][i]:
+                raise LayerParsingError("Layer '%s[%d]': %dx%d output map with padding=%d, stride=%d does not cover entire input image." % (name, i, dic['modulesX'], dic['modulesX'], dic['padding'][i], dic['stride'][i]))
+
+            if dic['groups'][i] > 1:
+                self.verify_divisible(dic['channels'][i], 4*dic['groups'][i], 'channels', '4 * groups', input_idx=i)
+                self.verify_divisible(dic['channels'][i], dic['groups'][i], 'channels', 'groups', input_idx=i)
+
+                self.verify_divisible(dic['filters'], 16*dic['groups'][i], 'filters * groups', input_idx=i)
+
+            dic['padding'][i] = -dic['padding'][i]
+#        dic['overSample'] = [groups*filterChannels/channels for groups,filterChannels,channels in zip(dic['groups'], dic['filterChannels'], dic['channels'])]
+        dic['weightsPerFilter'] = [fc * (fz**2) for fc, fz in zip(dic['filterChannels'], dic['filterSize'])]
+
+        return dic
+
+class ConvLayerParser(LocalLayerParser):
+    def __init__(self):
+        LocalLayerParser.__init__(self)
+
+    def add_params(self, mcp):
+        LocalLayerParser.add_params(self, mcp)
+        self.dic['wcNormMax'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMax', default=[0.0] * len(self.dic['inputs']))
+        self.dic['wcNormMin'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMin', default=[0.0] * len(self.dic['inputs']))
+        self.verify_num_params(['wcNormMax', 'wcNormMin'])
+        for min,max in zip(self.dic['wcNormMin'], self.dic['wcNormMax']):
+            if min > max:
+                raise LayerParsingError("Layer '%s': wcNormMin must be <= wcNormMax."
% (self.dic['name'])) + + def parse(self, name, mcp, prev_layers, model): + dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) + + dic['sumWidth'] = mcp.safe_get_int(name, 'sumWidth') + dic['sharedBiases'] = mcp.safe_get_bool(name, 'sharedBiases', default=True) + + num_biases = dic['filters'] if dic['sharedBiases'] else dic['modules']*dic['filters'] + + eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] + self.make_weights(dic['initW'], eltmult(dic['filterPixels'], dic['filterChannels']), [dic['filters']] * len(dic['inputs']), order='C') + self.make_biases(num_biases, 1, order='C') + + print "Initialized convolutional layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) + return dic + +class LocalUnsharedLayerParser(LocalLayerParser): + def __init__(self): + LocalLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) + eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] + scmult = lambda x, lst: [x * l for l in lst] + self.make_weights(dic['initW'], scmult(dic['modules'], eltmult(dic['filterPixels'], dic['filterChannels'])), [dic['filters']] * len(dic['inputs']), order='C') + self.make_biases(dic['modules'] * dic['filters'], 1, order='C') + + print "Initialized locally-connected layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) + return dic + +class DataLayerParser(LayerParser): + def __init__(self): + LayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerParser.parse(self, name, mcp, prev_layers, model) + dic['dataIdx'] = mcp.safe_get_int(name, 'dataIdx') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['end'] = mcp.safe_get_int(name, 'end', default=model.train_data_provider.get_data_dims(idx=dic['dataIdx'])) + dic['outputs'] = dic['end'] - dic['start'] +# dic['usesActs'] = False + print "Initialized data layer '%s', producing %d outputs" % (name, dic['outputs']) + return dic + +class SoftmaxLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['inputLayers'][0]['outputs'] + print "Initialized softmax layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class ConcatentionLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) + dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] + print "Initialized concatenation layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class PassThroughLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + # Note: this doesn't verify all the necessary constraints. Layer construction may still fail in C++ code. + # For example, it does not verify that every layer only has one pass-through parent. Obviously having + # two such parents is incoherent. 
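+    # (Roughly: a 'pass' layer exposes its inputs' activations to the layer above without
+    # transforming them, so its output size is just the sum of its inputs' output sizes; the
+    # checks below only catch replica-count mismatches and two 'pass' layers that read the
+    # same input on overlapping GPUs.)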
+ def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) +# if len(dic['inputLayers']) == 1: +# raise LayerParsingError("Layer %s: pass-through layer must have more than one input." % dic['name']) + if len(dic['gpu']) != len(dic['inputLayers'][0]['gpu']): + raise LayerParsingError("Layer '%s': number of replicas in pass-through layer must be equivalent to number of replicas in input layers." % dic['name']) + for inp in dic['inputLayers']: + conflicting_layers = [l for l in prev_layers.values() if l['type'] == 'pass' and inp['name'] in l['inputs'] and len(set(dic['gpu']).intersection(set(l['gpu']))) > 0] + if len(conflicting_layers) > 0: + raise LayerParsingError("Layer '%s' conflicts with layer '%s'. Both pass-through layers take layer '%s' as input and operate on an overlapping set of GPUs." % (dic['name'], conflicting_layers[0]['name'], inp['name'])) + dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) +# dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] + print "Initialized pass-through layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class PoolLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['stride'] = mcp.safe_get_int(name, 'stride') + dic['outputsX'] = mcp.safe_get_int(name, 'outputsX', default=0) + dic['pool'] = mcp.safe_get(name, 'pool') + + # Avg pooler does not use its acts or inputs + dic['usesActs'] = dic['pool'] != 'avg' + dic['usesInputs'] = dic['pool'] != 'avg' + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + if dic['pool'] == 'avg': + dic['sum'] = mcp.safe_get_bool(name, 'sum', default=False) + + self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) + self.verify_num_range(dic['stride'], 'stride', 1, dic['sizeX']) + self.verify_num_range(dic['outputsX'], 'outputsX', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + + if LayerWithInputParser.grad_consumers_below(dic): + self.verify_divisible(dic['channels'], 16, 'channels') + self.verify_str_in(dic['pool'], 'pool', ['max', 'maxabs', 'avg']) + + self.verify_img_size() + + if dic['outputsX'] <= 0: + dic['outputsX'] = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1; + dic['outputs'] = dic['outputsX']**2 * dic['channels'] + + print "Initialized %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels']) + return dic + + +class CrossMapPoolLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['size'] = mcp.safe_get_int(name, 'size') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['stride'] = 
mcp.safe_get_int(name, 'stride') + dic['outputChannels'] = mcp.safe_get_int(name, 'outputs', default=0) + dic['pool'] = mcp.safe_get(name, 'pool') + dic['requiresParams'] = False + + # Avg pooler does not use its acts or inputs + dic['usesActs'] = 'pool' != 'avg' + dic['usesInputs'] = 'pool' != 'avg' + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['outputs'] = dic['outputChannels'] * dic['imgPixels'] + + self.verify_num_range(dic['size'], 'size', 1, dic['channels']) + self.verify_num_range(dic['stride'], 'stride', 1, dic['size']) + self.verify_num_range(dic['outputChannels'], 'outputChannels', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_num_range(dic['start'], 'start', None, 0) + + self.verify_str_in(dic['pool'], 'pool', ['max']) + self.verify_img_size() + + covered_chans = dic['start'] + (dic['outputChannels'] - 1) * dic['stride'] + dic['size'] + if covered_chans < dic['channels']: + raise LayerParsingError("Layer '%s': cross-map pooling with start=%d, stride=%d, size=%d, outputs=%d covers only %d of %d input channels." % \ + (name, dic['start'], dic['stride'], dic['size'], dic['outputChannels'], covered_chans, dic['channels'])) + + print "Initialized cross-map %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['outputChannels']) + return dic + +class NormLayerParser(LayerWithInputParser): + RESPONSE_NORM = 'response' + CONTRAST_NORM = 'contrast' + CROSSMAP_RESPONSE_NORM = 'cross-map response' + + def __init__(self, norm_type): + LayerWithInputParser.__init__(self, num_inputs=1) + self.norm_type = norm_type + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['scale'] = mcp.safe_get_float(name, 'scale') + dic['scale'] /= dic['size'] if self.norm_type == self.CROSSMAP_RESPONSE_NORM else dic['size']**2 + dic['pow'] = mcp.safe_get_float(name, 'pow') + dic['minDiv'] = mcp.safe_get_float(name, 'minDiv', default=1.0) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['size'] = mcp.safe_get_int(name, 'size') + dic['blocked'] = mcp.safe_get_bool(name, 'blocked', default=False) + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + # Contrast normalization layer does not use its inputs + dic['usesInputs'] = self.norm_type != self.CONTRAST_NORM + + self.verify_num_range(dic['channels'], 'channels', 1, None) + if self.norm_type == self.CROSSMAP_RESPONSE_NORM: + self.verify_num_range(dic['size'], 'size', 2, dic['channels']) + if dic['channels'] % 16 != 0: + raise LayerParsingError("Layer '%s': number of channels must be divisible by 16 when using crossMap" % name) + else: + self.verify_num_range(dic['size'], 'size', 1, dic['imgSize']) + + if self.norm_type != self.CROSSMAP_RESPONSE_NORM and dic['channels'] > 3 and dic['channels'] % 4 != 0: + raise LayerParsingError("Layer '%s': number of channels must be smaller than 4 or divisible by 4" % name) + + self.verify_img_size() + + dic['outputs'] = dic['imgPixels'] * dic['channels'] + print "Initialized %s-normalization layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (self.norm_type, name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['channels']) + return 
dic + +class CostParser(LayerWithInputParser): + def __init__(self, num_inputs=-1): + LayerWithInputParser.__init__(self, num_inputs=num_inputs) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + # Stored as string because python can't pickle lambda functions + dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs]' + dic['children'] = mcp.safe_get_list(name, 'children', default=[]) + # Aggregated costs only produce outputs which are additive. + for c in dic['children']: + if c not in prev_layers: + raise LayerParsingError("Layer '%s': child cost layer '%s' not defined" % (name, c)) + if prev_layers[c]['type'] != dic['type']: + raise LayerParsingError("Layer '%s': child cost layer '%s' must have same type as parent" % (name, c)) + prev_layers[c]['aggregated'] = 1 + dic['aggregated'] = dic['children'] != [] + del dic['neuron'] + return dic + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['coeff'] = mcp.safe_get_float(name, 'coeff') + dic['gradConsumer'] = dic['coeff'] > 0 + +class CrossEntCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels + raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name) + if dic['inputLayers'][1]['type'] != 'softmax': + raise LayerParsingError("Layer '%s': Second input must be softmax layer" % name) + if dic['numInputs'][1] != model.train_data_provider.get_num_classes(): + raise LayerParsingError("Layer '%s': Softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \ + % (name, dic['inputs'][1], model.train_data_provider.get_num_classes())) + + print "Initialized cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class LogregCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def add_params(self, mcp): + CostParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['topk'] = mcp.safe_get_int(name, 'topk', default=1) + if dic['topk'] > dic['numInputs'][1]: + raise LayerParsingError("Layer '%s': parameter 'topk'must not have value greater than the number of classess." 
% (name)) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + if dic['numInputs'][0] != 1: # first input must be labels + raise LayerParsingError("Layer '%s': dimensionality of first input must be 1" % name) + if dic['inputLayers'][1]['type'] != 'softmax': + raise LayerParsingError("Layer '%s': second input must be softmax layer" % name) + if dic['numInputs'][1] != model.train_data_provider.get_num_classes(): + raise LayerParsingError("Layer '%s': softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \ + % (name, dic['inputs'][1], model.train_data_provider.get_num_classes())) + + print "Initialized logistic regression cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class BinomialCrossEntCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def add_params(self, mcp): + CostParser.add_params(self, mcp) + self.dic['posWeight'] = mcp.safe_get_float(self.dic['name'], 'posWeight', default=1.0) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + + if dic['numInputs'][0] != dic['numInputs'][1]: + raise LayerParsingError("Layer '%s': both inputs must produce the same number of outputs" % (name)) + + if 'neuron' not in dic['inputLayers'][1] or dic['inputLayers'][1]['neuron'] != 'logistic': + print "WARNING: Layer '%s': input '%s' is not logistic, results may not be what you intend." % (dic['name'], dic['inputs'][1]) + + if dic['type'] == 'cost.bce': + print "Initialized binomial cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + + + dic['computeSoftmaxErrorRate'] = True + return dic + +class DetectionCrossEntCostParser(BinomialCrossEntCostParser): + def __init__(self): + BinomialCrossEntCostParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = BinomialCrossEntCostParser.parse(self, name, mcp, prev_layers, model) + if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels + raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name) + dic['computeSoftmaxErrorRate'] = False + dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs[:2]] + [(class_cost[2] / class_cost[j] if class_cost[j] > 0 else n.inf) for class_cost in [costs[2:][i*3:(i+1)*3] for i in range(len(costs[2:])/3)] for j in range(2)]' + dic['outputFilterFormatter'] = 'lambda self,costs: "(crossent) %.6f, (err) %.6f, " % (costs[0], costs[1]) + ", ".join("(%s) %.6f, %.6f" % (self.train_data_provider.batch_meta["label_names"][i/2-1],costs[i],costs[i+1]) for i in xrange(2, len(costs), 2))' + print "Initialized detection cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class SumOfSquaresCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + print "Initialized sum-of-squares cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +# All the layer parsers +layer_parsers = {'data' : lambda : DataLayerParser(), + 'fc': lambda : FCLayerParser(), + 'sfc': lambda : SplitFCLayerParser(), + 'conv': lambda : ConvLayerParser(), + 'local': lambda : LocalUnsharedLayerParser(), + 'softmax': lambda : SoftmaxLayerParser(), + 'eltsum': lambda : EltwiseSumLayerParser(), + 'eltmax': lambda : 
EltwiseMaxLayerParser(), + 'sum': lambda : SumLayerParser(), + 'neuron': lambda : NeuronLayerParser(), + 'pool': lambda : PoolLayerParser(), + 'cmpool': lambda : CrossMapPoolLayerParser(), + 'rnorm': lambda : NormLayerParser(NormLayerParser.RESPONSE_NORM), + 'cnorm': lambda : NormLayerParser(NormLayerParser.CONTRAST_NORM), + 'cmrnorm': lambda : NormLayerParser(NormLayerParser.CROSSMAP_RESPONSE_NORM), + 'nailbed': lambda : NailbedLayerParser(), + 'blur': lambda : GaussianBlurLayerParser(), + 'href': lambda : HorizontalReflectionLayerParser(), + 'resize': lambda : ResizeLayerParser(), + 'rgb2yuv': lambda : RGBToYUVLayerParser(), + 'rgb2lab': lambda : RGBToLABLayerParser(), + 'rscale': lambda : RandomScaleLayerParser(), + 'crop': lambda : CropLayerParser(), + 'concat': lambda : ConcatentionLayerParser(), + 'pass': lambda : PassThroughLayerParser(), + 'dropout': lambda : DropoutLayerParser(), + 'dropout2': lambda : Dropout2LayerParser(), + 'cost.logreg': lambda : LogregCostParser(), + 'cost.crossent': lambda : CrossEntCostParser(), + 'cost.bce': lambda : BinomialCrossEntCostParser(), + 'cost.dce': lambda : DetectionCrossEntCostParser(), + 'cost.sum2': lambda : SumOfSquaresCostParser()} + +# All the neuron parsers +# This isn't a name --> parser mapping as the layer parsers above because neurons don't have fixed names. +# A user may write tanh[0.5,0.25], etc. +neuron_parsers = sorted([NeuronParser('ident', 'f(x) = x', uses_acts=False, uses_inputs=False), + NeuronParser('logistic', 'f(x) = 1 / (1 + e^-x)', uses_acts=True, uses_inputs=False), + NeuronParser('abs', 'f(x) = |x|', uses_acts=False, uses_inputs=True), + NeuronParser('relu', 'f(x) = max(0, x)', uses_acts=True, uses_inputs=False), + NeuronParser('nrelu', 'f(x) = max(0, x) + noise', uses_acts=True, uses_inputs=False), + NeuronParser('softrelu', 'f(x) = log(1 + e^x)', uses_acts=True, uses_inputs=False), + NeuronParser('square', 'f(x) = x^2', uses_acts=False, uses_inputs=True), + NeuronParser('sqrt', 'f(x) = sqrt(x)', uses_acts=True, uses_inputs=False), + ParamNeuronParser('log[a]', 'f(x) = log(a + x)', uses_acts=False, uses_inputs=True), + ParamNeuronParser('tanh[a,b]', 'f(x) = a * tanh(b * x)', uses_acts=True, uses_inputs=False), + ParamNeuronParser('brelu[a]', 'f(x) = min(a, max(0, x))', uses_acts=True, uses_inputs=False), + ParamNeuronParser('linear[a,b]', 'f(x) = a * x + b', uses_acts=True, uses_inputs=False), + ParamNeuronParser('drelu[a]', 'f(x) = x - a * tanh(x / a)', uses_acts=False, uses_inputs=True)], + key=lambda x:x.type) + +# Learning rate schedules +lrs_parsers = sorted([ParamParser('const[fbase]'), + ParamParser('linear[fbase;ftgtFactor]'), + ParamParser('exp[fbase;ftgtFactor]'), + ParamParser('dexp[fbase;ftgtFactor;inumSteps]')]) diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg new file mode 100644 index 0000000..a24d538 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg @@ -0,0 +1,57 @@ +# 11% error on CIFAR-10 - layer parameter file +# Methodology: +# 1. Train on batches 1-4, use batch 5 for validation. +# 2. After about 350 epochs, validation error no longer making improvements. +# 3. Fold in batch 5. +# 4. Train on batches 1-5 for about 150 more epochs, until the batch 5 error is near the errors for batches 1-4. It takes forever to actually get there but after 150 epochs it's close enough. +# 5. Lower learning rates (epsW) by a factor of 10 to 0.0001, train for 10 more epochs. +# 6. 
Lower learning rates (epsW) by another factor of 10 to 0.00001, train for 10 more epochs. +# 7. Stop. Test on batch 6 with --test-range=6 --multiview-test=1 --logreg-name=logprob (read more about what this does here: http://code.google.com/p/cuda-convnet/wiki/TrainingNet#Training_on_image_translations ) + +# More details about methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology + +[conv1] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.000 + +[conv2] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.000 + +[local3] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.004 + +[local4] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.004 + +[fc10] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.01 + +[logprob] +coeff=1 + +[rnorm1] +scale=0.001 +pow=0.75 + +[rnorm2] +scale=0.001 +pow=0.75 diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg new file mode 100644 index 0000000..9462f5b --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg new file mode 100644 index 0000000..f06dda2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] 
+momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg new file mode 100644 index 0000000..a4dba87 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg @@ -0,0 +1,182 @@ +[conv1a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv1b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv2a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv2b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv3a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + + +[conv3b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[conv4a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv4b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv5a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv5b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[fc2048a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048ba] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048bb] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + +[fc1000] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[logprob] +coeff=1 +topk=5 + +[dropout1a] +enable=true +keep=0.5 + +[dropout2a] +enable=true +keep=0.5 + 
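The epsW/epsB entries in these parameter files use the dexp[base;tgtFactor;numSteps] learning-rate schedule that layer.py registers in its lrs_parsers list. The exact evaluation lives in the trainer's C++ code; a plausible reading, assumed here, is a discrete exponential decay from base down to base/tgtFactor over numSteps plateaus. A minimal Python sketch under that assumption:

def dexp_schedule(base, tgt_factor, num_steps, progress):
    # Discrete exponential decay: hold the rate on num_steps plateaus, dropping
    # by a constant factor each step so the final plateau equals base/tgt_factor.
    # progress is the fraction of training completed, in [0, 1).
    # This is an illustrative reading of dexp[...], not the reference code.
    step = min(int(progress * num_steps), num_steps - 1)
    per_step = tgt_factor ** (1.0 / (num_steps - 1)) if num_steps > 1 else float(tgt_factor)
    return base / per_step ** step

# Example: epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
for p in (0.0, 0.3, 0.6, 0.9):
    print("progress %.1f -> eps %.6f" % (p, dexp_schedule(0.02, 250.0, 4, p)))
# the last plateau is 0.02 / 250 = 0.00008
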
+[dropout1b] +enable=true +keep=0.5 + +[dropout2b] +enable=true +keep=0.5 + +[rnorm1a] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm1b] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2a] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2b] +scale=0.0001 +pow=0.75 +minDiv=2 + +[cnorm2a] +scale=0.001 +pow=0.75 + +[cnorm2b] +scale=0.001 +pow=0.75 diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg new file mode 100644 index 0000000..4d1f078 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg @@ -0,0 +1,169 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[fc1024a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024c] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024d] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024ba] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bb] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bc] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bd] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1a] +enable=true +keep=0.5 + +[dropout1b] +enable=true +keep=0.5 + +[dropout1c] +enable=true +keep=0.5 + +[dropout1d] +enable=true +keep=0.5 + +[dropout2a] +enable=true +keep=0.5 + +[dropout2b] +enable=true +keep=0.5 + +[dropout2c] +enable=true +keep=0.5 + +[dropout2d] +enable=true +keep=0.5 + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg new file mode 100644 index 0000000..b3febfd --- /dev/null +++ 
b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg new file mode 100644 index 0000000..44fc31a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg @@ -0,0 +1,103 @@ +[data] +type=data +dataIdx=0 + +[labels] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=2 +stride=1 +filterSize=5 +neuron=relu +initW=0.0001 +sumWidth=4 +sharedBiases=1 +gpu=0 + +[pool1] +type=pool +pool=max +inputs=conv1 +start=0 +sizeX=3 +stride=2 +outputsX=0 +channels=64 + +[rnorm1] +type=cmrnorm +inputs=pool1 +channels=64 +size=9 + +[conv2] +type=conv +inputs=rnorm1 +filters=64 +padding=2 +stride=1 +filterSize=5 +channels=64 +neuron=relu +initW=0.01 +sumWidth=2 +sharedBiases=1 + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=64 +size=9 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +start=0 +sizeX=3 +stride=2 +outputsX=0 +channels=64 + +[local3] +type=local +inputs=pool2 +filters=64 +padding=1 +stride=1 +filterSize=3 +channels=64 +neuron=relu +initW=0.04 + +[local4] +type=local +inputs=local3 +filters=32 +padding=1 +stride=1 +filterSize=3 +channels=64 +neuron=relu +initW=0.04 + +[fc10] +type=fc +outputs=10 +inputs=local4 +initW=0.01 + +[probs] +type=softmax +inputs=fc10 + +[logprob] +type=cost.logreg +inputs=labels,probs +gpu=0 diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg new file mode 100644 index 0000000..0b549bb --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg @@ -0,0 +1,155 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 
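When a pool section like the one above leaves outputsX at 0, PoolLayerParser (earlier in layer.py) derives it as ceil((imgSize - start - sizeX) / stride) + 1. A quick check of that formula; the 55x55 input size is only an assumed example, since the real value depends on the preceding conv layer:

from math import ceil

def pool_outputs_x(img_size, size_x, stride, start=0):
    # Mirrors PoolLayerParser: number of pooling windows along one axis.
    return int(ceil((img_size - start - size_x) / float(stride))) + 1

# sizeX=3, stride=2 as in the pool layers above; 55 is a hypothetical input width.
print(pool_outputs_x(55, 3, 2))   # 27, so the layer emits 27*27*channels outputs
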
+channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 +gpu=0 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg new file mode 100644 index 0000000..f27093c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg @@ -0,0 +1,152 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0,1 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg new file mode 100644 index 0000000..5180134 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg @@ -0,0 +1,304 @@ +[data] 
+type=data +dataIdx=0 + +[labels] +type=data +dataIdx=1 + +[conv1a] +type=conv +inputs=data +channels=3 +filters=48 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=3 +sharedBiases=1 +gpu=0 + +[conv1b] +type=conv +inputs=data +channels=3 +filters=48 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=3 +sharedBiases=1 +gpu=1 + +[rnorm1a] +type=cmrnorm +inputs=conv1a +channels=48 +size=5 + +[rnorm1b] +type=cmrnorm +inputs=conv1b +channels=48 +size=5 + +[pool1a] +type=pool +pool=max +inputs=rnorm1a +sizeX=3 +stride=2 +channels=48 +neuron=relu + +[pool1b] +type=pool +pool=max +inputs=rnorm1b +sizeX=3 +stride=2 +channels=48 +neuron=relu + +[conv2a] +type=conv +inputs=pool1a +filters=128 +padding=2 +stride=1 +filterSize=5 +channels=48 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu +gpu=0 + +[conv2b] +type=conv +inputs=pool1b +filters=128 +padding=2 +stride=1 +filterSize=5 +channels=48 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu +gpu=1 + +[rnorm2a] +type=cmrnorm +inputs=conv2a +channels=128 +size=5 + +[rnorm2b] +type=cmrnorm +inputs=conv2b +channels=128 +size=5 + +[cnorm2a] +type=rnorm +inputs=rnorm2a +channels=128 +size=5 + +[cnorm2b] +type=rnorm +inputs=rnorm2b +channels=128 +size=5 + +[pool2a] +type=pool +pool=max +inputs=cnorm2a +sizeX=3 +stride=2 +channels=128 + +[pool2b] +type=pool +pool=max +inputs=cnorm2b +sizeX=3 +stride=2 +channels=128 + +[conv3a] +type=conv +inputs=pool2a,pool2b +filters=192,192 +padding=1,1 +stride=1,1 +filterSize=3,3 +channels=128,128 +initW=0.03,0.03 +sumWidth=2 +sharedBiases=1 +neuron=relu +gpu=0 + +[conv3b] +type=conv +inputs=pool2a,pool2b +filters=192,192 +padding=1,1 +stride=1,1 +filterSize=3,3 +channels=128,128 +initW=0.03,0.03 +sumWidth=2 +sharedBiases=1 +neuron=relu +gpu=1 + +[conv4a] +type=conv +inputs=conv3a +filters=192 +padding=1 +stride=1 +filterSize=3 +channels=192 +neuron=relu +initW=0.03 +initB=1 +sumWidth=2 +sharedBiases=1 + +[conv4b] +type=conv +inputs=conv3b +filters=192 +padding=1 +stride=1 +filterSize=3 +channels=192 +neuron=relu +initW=0.03 +initB=1 +sumWidth=2 +sharedBiases=1 + + +[conv5a] +type=conv +inputs=conv4a +filters=128 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +initB=1 +sumWidth=2 +groups=1 +randSparse=0 + +[conv5b] +type=conv +inputs=conv4b +filters=128 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +initB=1 +sumWidth=2 +groups=1 +randSparse=0 + +[pool3a] +type=pool +pool=max +inputs=conv5a +sizeX=3 +stride=2 +channels=128 +neuron=relu + +[pool3b] +type=pool +pool=max +inputs=conv5b +sizeX=3 +stride=2 +channels=128 +neuron=relu + +[fc2048a] +type=fc +inputs=pool3a,pool3b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=0 + +[fc2048b] +type=fc +inputs=pool3a,pool3b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=1 + +[dropout1a] +type=dropout +inputs=fc2048a + +[dropout1b] +type=dropout +inputs=fc2048b + +[fc2048ba] +type=fc +inputs=dropout1a,dropout1b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=0 + +[fc2048bb] +type=fc +inputs=dropout1b,dropout1a +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=1 + +[dropout2a] +type=dropout +inputs=fc2048ba + +[dropout2b] +type=dropout +inputs=fc2048bb + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2a,dropout2b +initW=0.01,0.01 +gpu=0 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labels,probs +gpu=0 diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg 
b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg new file mode 100644 index 0000000..3d79b4d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg @@ -0,0 +1,257 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1,2,3 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc1024a] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[fc1024b] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=1 + +[fc1024c] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=2 + +[fc1024d] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=3 + +[dropout1a] +type=dropout2 +inputs=fc1024a + +[dropout1b] +type=dropout2 +inputs=fc1024b + +[dropout1c] +type=dropout2 +inputs=fc1024c + +[dropout1d] +type=dropout2 +inputs=fc1024d + +# This is like a concatenation layer +[pass1a] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=0 + +# This is like a concatenation layer +[pass1b] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=1 + +# This is like a concatenation layer +[pass1c] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=2 + +# This is like a concatenation layer +[pass1d] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=3 + + +[fc1024ba] +type=fc +inputs=pass1a +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bb] +type=fc +inputs=pass1b +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bc] +type=fc +inputs=pass1c +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bd] +type=fc +inputs=pass1d +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[dropout2a] +type=dropout2 +inputs=fc1024ba + +[dropout2b] +type=dropout2 +inputs=fc1024bb + +[dropout2c] +type=dropout2 +inputs=fc1024bc + +[dropout2d] +type=dropout2 +inputs=fc1024bd + +[pass2a] +inputs=dropout2a,dropout2b,dropout2c,dropout2d +type=pass +gpu=0 + +[fc1000] +type=fc +outputs=1000 +inputs=pass2a +initW=0.01 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg new file mode 100644 index 0000000..e804fdc --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg @@ -0,0 +1,152 @@ +[data] +type=data +dataIdx=0 + +[labvec] 
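The pass layers in the 4-GPU data+model config above are annotated as concatenation-like, which matches PassThroughLayerParser earlier in layer.py: it checks that replica counts and GPU sets are compatible and sets the layer's output count to the sum of its inputs' outputs. A small sketch of that bookkeeping using the values from the config (the layer dicts here are simplified stand-ins):

def pass_through_outputs(input_layers):
    # Mirrors PassThroughLayerParser: a pass layer exposes the concatenation of
    # its inputs, so its output count is the sum of the inputs' output counts.
    return sum(layer['outputs'] for layer in input_layers)

dropouts = [{'name': 'dropout1' + s, 'outputs': 1024} for s in 'abcd']
print(pass_through_outputs(dropouts))   # 4096: what each fc1024b* layer sees via pass1*
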
+type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1,2,3 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0,1,2,3 + diff --git a/caffe2/contrib/cuda-convnet2/make-data/input_meta b/caffe2/contrib/cuda-convnet2/make-data/input_meta new file mode 100644 index 0000000..659b20b Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/make-data/input_meta differ diff --git a/caffe2/contrib/cuda-convnet2/make-data/make-data.py b/caffe2/contrib/cuda-convnet2/make-data/make-data.py new file mode 100644 index 0000000..1861ceb --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/make-data.py @@ -0,0 +1,157 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################# + + +# This script makes batches suitable for training from raw ILSVRC 2012 tar files. + +import tarfile +from StringIO import StringIO +from random import shuffle +import sys +from time import time +from pyext._MakeDataPyExt import resizeJPEG +import itertools +import os +import cPickle +import scipy.io +import math +import argparse as argp + +# Set this to True to crop images to square. In this case each image will be +# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels, and then the +# center OUTPUT_IMAGE_SIZE x OUTPUT_IMAGE_SIZE patch will be extracted. +# +# Set this to False to preserve image borders. In this case each image will be +# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels. 
This was +# demonstrated to be superior by Andrew Howard in his very nice paper: +# http://arxiv.org/abs/1312.5402 +CROP_TO_SQUARE = True +OUTPUT_IMAGE_SIZE = 256 + +# Number of threads to use for JPEG decompression and image resizing. +NUM_WORKER_THREADS = 8 + +# Don't worry about these. +OUTPUT_BATCH_SIZE = 3072 +OUTPUT_SUB_BATCH_SIZE = 1024 + +def pickle(filename, data): + with open(filename, "w") as fo: + cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) + +def unpickle(filename): + fo = open(filename, 'r') + contents = cPickle.load(fo) + fo.close() + return contents + +def partition_list(l, partition_size): + divup = lambda a,b: (a + b - 1) / b + return [l[i*partition_size:(i+1)*partition_size] for i in xrange(divup(len(l),partition_size))] + +def open_tar(path, name): + if not os.path.exists(path): + print "ILSVRC 2012 %s not found at %s. Make sure to set ILSVRC_SRC_DIR correctly at the top of this file (%s)." % (name, path, sys.argv[0]) + sys.exit(1) + return tarfile.open(path) + +def makedir(path): + if not os.path.exists(path): + os.makedirs(path) + +def parse_devkit_meta(ILSVRC_DEVKIT_TAR): + tf = open_tar(ILSVRC_DEVKIT_TAR, 'devkit tar') + fmeta = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/meta.mat')) + meta_mat = scipy.io.loadmat(StringIO(fmeta.read())) + labels_dic = dict((m[0][1][0], m[0][0][0][0]-1) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) + label_names_dic = dict((m[0][1][0], m[0][2][0]) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) + label_names = [tup[1] for tup in sorted([(v,label_names_dic[k]) for k,v in labels_dic.items()], key=lambda x:x[0])] + + fval_ground_truth = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt')) + validation_ground_truth = [[int(line.strip()) - 1] for line in fval_ground_truth.readlines()] + tf.close() + return labels_dic, label_names, validation_ground_truth + +def write_batches(target_dir, name, start_batch_num, labels, jpeg_files): + jpeg_files = partition_list(jpeg_files, OUTPUT_BATCH_SIZE) + labels = partition_list(labels, OUTPUT_BATCH_SIZE) + makedir(target_dir) + print "Writing %s batches..." 
% name + for i,(labels_batch, jpeg_file_batch) in enumerate(zip(labels, jpeg_files)): + t = time() + jpeg_strings = list(itertools.chain.from_iterable(resizeJPEG([jpeg.read() for jpeg in jpeg_file_batch], OUTPUT_IMAGE_SIZE, NUM_WORKER_THREADS, CROP_TO_SQUARE))) + batch_path = os.path.join(target_dir, 'data_batch_%d' % (start_batch_num + i)) + makedir(batch_path) + for j in xrange(0, len(labels_batch), OUTPUT_SUB_BATCH_SIZE): + pickle(os.path.join(batch_path, 'data_batch_%d.%d' % (start_batch_num + i, j/OUTPUT_SUB_BATCH_SIZE)), + {'data': jpeg_strings[j:j+OUTPUT_SUB_BATCH_SIZE], + 'labels': labels_batch[j:j+OUTPUT_SUB_BATCH_SIZE]}) + print "Wrote %s (%s batch %d of %d) (%.2f sec)" % (batch_path, name, i+1, len(jpeg_files), time() - t) + return i + 1 + +if __name__ == "__main__": + parser = argp.ArgumentParser() + parser.add_argument('--src-dir', help='Directory containing ILSVRC2012_img_train.tar, ILSVRC2012_img_val.tar, and ILSVRC2012_devkit_t12.tar.gz', required=True) + parser.add_argument('--tgt-dir', help='Directory to output ILSVRC 2012 batches suitable for cuda-convnet to train on.', required=True) + args = parser.parse_args() + + print "CROP_TO_SQUARE: %s" % CROP_TO_SQUARE + print "OUTPUT_IMAGE_SIZE: %s" % OUTPUT_IMAGE_SIZE + print "NUM_WORKER_THREADS: %s" % NUM_WORKER_THREADS + + ILSVRC_TRAIN_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_train.tar') + ILSVRC_VALIDATION_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_val.tar') + ILSVRC_DEVKIT_TAR = os.path.join(args.src_dir, 'ILSVRC2012_devkit_t12.tar.gz') + + assert OUTPUT_BATCH_SIZE % OUTPUT_SUB_BATCH_SIZE == 0 + labels_dic, label_names, validation_labels = parse_devkit_meta(ILSVRC_DEVKIT_TAR) + + with open_tar(ILSVRC_TRAIN_TAR, 'training tar') as tf: + synsets = tf.getmembers() + synset_tars = [tarfile.open(fileobj=tf.extractfile(s)) for s in synsets] + print "Loaded synset tars." + print "Building training set image list (this can take 10-20 minutes)..." + sys.stdout.flush() + + train_jpeg_files = [] + for i,st in enumerate(synset_tars): + if i % 100 == 0: + print "%d%% ..." % int(round(100.0 * float(i) / len(synset_tars))), + sys.stdout.flush() + train_jpeg_files += [st.extractfile(m) for m in st.getmembers()] + st.close() + + shuffle(train_jpeg_files) + train_labels = [[labels_dic[jpeg.name[:9]]] for jpeg in train_jpeg_files] + print "done" + + # Write training batches + i = write_batches(args.tgt_dir, 'training', 0, train_labels, train_jpeg_files) + + # Write validation batches + val_batch_start = int(math.ceil((i / 1000.0))) * 1000 + with open_tar(ILSVRC_VALIDATION_TAR, 'validation tar') as tf: + validation_jpeg_files = sorted([tf.extractfile(m) for m in tf.getmembers()], key=lambda x:x.name) + write_batches(args.tgt_dir, 'validation', val_batch_start, validation_labels, validation_jpeg_files) + + # Write meta file + meta = unpickle('input_meta') + meta_file = os.path.join(args.tgt_dir, 'batches.meta') + meta.update({'batch_size': OUTPUT_BATCH_SIZE, + 'num_vis': OUTPUT_IMAGE_SIZE**2 * 3, + 'label_names': label_names}) + pickle(meta_file, meta) + print "Wrote %s" % meta_file + print "All done! ILSVRC 2012 batches are in %s" % args.tgt_dir diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile b/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile new file mode 100644 index 0000000..7b7ae56 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile @@ -0,0 +1,50 @@ +# Copyright 2014 Google Inc. All rights reserved. 
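make-data.py above shuffles the training JPEGs, cuts them into batches of OUTPUT_BATCH_SIZE, and writes each batch directory as OUTPUT_SUB_BATCH_SIZE-sized pickles named data_batch_N.K. A minimal sketch of that partitioning (the 10000-image count is just an example):

def partition_list(l, partition_size):
    # Ceil-divide so the last partition keeps the remainder, as in make-data.py.
    divup = lambda a, b: (a + b - 1) // b
    return [l[i * partition_size:(i + 1) * partition_size]
            for i in range(divup(len(l), partition_size))]

OUTPUT_BATCH_SIZE = 3072
OUTPUT_SUB_BATCH_SIZE = 1024

images = list(range(10000))                       # stand-in for the shuffled JPEG list
batches = partition_list(images, OUTPUT_BATCH_SIZE)
print([len(b) for b in batches])                  # [3072, 3072, 3072, 784]
print([len(partition_list(b, OUTPUT_SUB_BATCH_SIZE)) for b in batches])
# [3, 3, 3, 1]: data_batch_N.0 .. data_batch_N.2 (or .0 only) inside each batch dir
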
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDES := -I./include +COMMONFLAGS := +CC_ARGS := + +ifndef debug + CC_ARGS += -O3 +endif +CC=g++ + +OUT_DIR=./bin/$(OUT_SUFFIX) + +PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) +LINK_LIBS := -L$(CUDA_INSTALL_PATH)/lib64 `pkg-config --libs python` `pkg-config --libs opencv` -lpthread + +INCLUDES += -I$(PYTHON_INCLUDE_PATH) +OUT_FILE=_MakeDataPyExt.so + +all: dir classes $(OUT_FILE) + +dir: + mkdir -p $(OUT_DIR)/src + +SOURCES = $(shell echo src/*.cpp) +CLASSES = $(SOURCES:.cpp=.o) + +classes: $(CLASSES) + +%.o: %.cpp + $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o + +$(OUT_FILE): classes + cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS) + ln -sf $(OUT_DIR)/$(OUT_FILE) . + +clean: + rm -rf $(OUT_DIR)/* diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py b/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py new file mode 100644 index 0000000..520b1ea --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h b/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h new file mode 100644 index 0000000..6e4c655 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h @@ -0,0 +1,59 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
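The pyext.h header that follows declares the resizeJPEG entry point used by make-data.py. Judging from the PyArg_ParseTuple call in pyext.cpp, it takes (list of JPEG byte strings, target size, worker threads, crop flag) and returns one list per worker thread, which is why make-data.py flattens the result with itertools.chain. A hedged usage sketch; the file paths are placeholders:

import itertools
from pyext._MakeDataPyExt import resizeJPEG   # built by the Makefile above

raw = [open(p, 'rb').read() for p in ('img0.jpg', 'img1.jpg')]   # placeholder paths
per_thread = resizeJPEG(raw, 256, 8, True)    # size=256, 8 worker threads, crop to square
resized = list(itertools.chain.from_iterable(per_thread))
print(len(resized))                            # one re-encoded JPEG string per input
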
+ */ + +#ifndef INCLUDE_PYEXT_H_ +#define INCLUDE_PYEXT_H_ + +#include +//#include +#include +#include +#include "../../../util/include/thread.h" + +#define JPEG_QUALITY 95 + +#ifndef DIVUP +#define DIVUP(a,b) (((a) + (b) - 1) / (b)) +#endif + +extern "C" { + void init_MakeDataPyExt(); +} +PyObject* resizeJPEG(PyObject *self, PyObject *args); + +class DecoderThread : public Thread { + protected: + PyObject* _py_list_src; + PyObject* _py_list_tgt; + int _start_img, _end_img; + int _target_size; + bool _crop_to_square; + + cv::Mat _resized_mat_buffer; + std::vector _output_jpeg_buffer; + std::vector _encode_params; + + void* run(); + void makeJPEG(int idx); + + public: + DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square); + virtual ~DecoderThread(); + PyObject* getTargetList(); +}; + + +#endif // INCLUDE_PYEXT_H_ diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp b/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp new file mode 100644 index 0000000..0e3c0c7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp @@ -0,0 +1,131 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/pyext.h" + +using namespace std; + +static PyMethodDef _MakeDataPyExtMethods[] = {{ "resizeJPEG", resizeJPEG, METH_VARARGS }, + { NULL, NULL } +}; + +void init_MakeDataPyExt() { + (void) Py_InitModule("_MakeDataPyExt", _MakeDataPyExtMethods); +} + +PyObject* resizeJPEG(PyObject *self, PyObject *args) { + + PyListObject* pyListSrc; + int tgtImgSize, numThreads; + int cropToSquare; + + if (!PyArg_ParseTuple(args, "O!iii", + &PyList_Type, &pyListSrc, + &tgtImgSize, + &numThreads, + &cropToSquare)) { + return NULL; + } + + DecoderThread* threads[numThreads]; + int num_imgs = PyList_GET_SIZE(pyListSrc); + int num_imgs_per_thread = DIVUP(num_imgs, numThreads); + for (int t = 0; t < numThreads; ++t) { + int start_img = t * num_imgs_per_thread; + int end_img = min(num_imgs, (t+1) * num_imgs_per_thread); + + threads[t] = new DecoderThread((PyObject*)pyListSrc, start_img, end_img, tgtImgSize, cropToSquare); + threads[t]->start(); + } + + PyObject* pyListTgt = PyList_New(0); + for (int t = 0; t < numThreads; ++t) { + threads[t]->join(); + PyList_Append(pyListTgt, threads[t]->getTargetList()); + delete threads[t]; // the thread's list too + } + + return pyListTgt; +} + +DecoderThread::DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square) +: Thread(true), _py_list_src(py_list_src), _start_img(start_img), _end_img(end_img), _target_size(target_size), _crop_to_square(crop_to_square) { + + _encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); + _encode_params.push_back(JPEG_QUALITY); + _py_list_tgt = PyList_New(0); +} + +DecoderThread::~DecoderThread(){ + Py_DECREF(_py_list_tgt); +} + +void* DecoderThread::run() { + for (int i = _start_img; i < _end_img; ++i) { + makeJPEG(i); + } + return NULL; 
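+ // Each DecoderThread works on its half-open slice [_start_img, _end_img) of the
+ // source list; makeJPEG() below decodes one JPEG, scales its shortest edge to
+ // _target_size, optionally center-crops to a square, and re-encodes at JPEG_QUALITY.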
+} + +PyObject* DecoderThread::getTargetList() { + return _py_list_tgt; +} + +void DecoderThread::makeJPEG(int idx) { + /* + * Decompress JPEG + */ + PyObject* pySrc = PyList_GET_ITEM(_py_list_src, idx); + uchar* src = (unsigned char*)PyString_AsString(pySrc); + size_t src_len = PyString_GET_SIZE(pySrc); + vector src_vec(src, src + src_len); + + cv::Mat decoded_mat = cv::imdecode(cv::Mat(src_vec), CV_LOAD_IMAGE_COLOR); + assert(decoded_mat.channels() == 3); + + /* + * Resize + */ + double min_dim = std::min(decoded_mat.size().height, decoded_mat.size().width); + double scale_factor = _target_size / min_dim; + + int new_height = round(scale_factor * decoded_mat.size().height); + int new_width = round(scale_factor * decoded_mat.size().width); + assert((new_height == _target_size && new_width >= _target_size) + || (new_width == _target_size && new_height >= _target_size)); + int interpolation = scale_factor == 1 ? cv::INTER_LINEAR + : scale_factor > 1 ? cv::INTER_CUBIC : cv::INTER_AREA; + + cv::resize(decoded_mat, _resized_mat_buffer, cv::Size(new_width, new_height), 0, 0, interpolation); + + /* + * Conditionally crop and compress JPEG + */ + if (_crop_to_square) { + int crop_start_x = (new_width - _target_size) / 2; + int crop_start_y = (new_height - _target_size) / 2; + cv::Rect cropRect(crop_start_x, crop_start_y, _target_size, _target_size); + cv::Mat cropped_mat_buffer = _resized_mat_buffer(cropRect); + cv::imencode(".jpg", cropped_mat_buffer, _output_jpeg_buffer, _encode_params); + } else { + cv::imencode(".jpg", _resized_mat_buffer, _output_jpeg_buffer, _encode_params); + } + + char* output_jpeg_buffer_ptr = reinterpret_cast(&_output_jpeg_buffer[0]); + PyObject* pyStr = PyString_FromStringAndSize(output_jpeg_buffer_ptr, _output_jpeg_buffer.size()); + PyList_Append(_py_list_tgt, pyStr); + Py_DECREF(pyStr); +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile b/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile new file mode 100644 index 0000000..81b8dd4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile @@ -0,0 +1,108 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +LDFLAGS += -L../util -lutilpy -lcublas +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . -name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/libnvmatrix.so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) + ln -sf $(TARGET) . + +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh new file mode 100644 index 0000000..5154a0d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh @@ -0,0 +1,317 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
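The memory.cuh header below implements FastMemoryManager, a bucketed GPU allocator: requests are rounded up to power-of-two size classes starting at 4 KB (FIRST_BUCKET_SIZE), and a bit vector tracks which classes currently have free segments. A Python restatement of the size-class math encoded by the GET_ALLOC_BUCKET and GET_BUCKET_SIZE macros, for illustration only:

LOG_FIRST_BUCKET_SIZE = 12            # first bucket serves allocations up to 4 KB

def get_alloc_bucket(size):
    # Same computation as GET_ALLOC_BUCKET: smallest b with size <= 4096 * 2**b.
    return ((size - 1) >> LOG_FIRST_BUCKET_SIZE).bit_length()

def get_bucket_size(bucket):
    # Same computation as GET_BUCKET_SIZE.
    return 1 << (LOG_FIRST_BUCKET_SIZE + bucket)

for size in (1, 4096, 4097, 100000):
    b = get_alloc_bucket(size)
    print("%6d bytes -> bucket %d (%d bytes)" % (size, b, get_bucket_size(b)))
# 1 and 4096 land in bucket 0, 4097 in bucket 1 (8192), 100000 in bucket 5 (131072)
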
+ */ + +#ifndef MEMORY_CUH_H_ +#define MEMORY_CUH_H_ +#include +#include +#include +#include +#include + +#include +#include "../../util/include/sync.h" +#include "nvmatrix_kernels.cuh" + +#define GPU_ALLOC_FRACTION 0.95 // Take 95% of available GPU memory +#define HOST_ALLOC_CHUNK (1UL << 32) +#define SYNC_ON_FREE true +#define BUCKET_TYPE unsigned int + +// Allocte memory from up to this many buckets higher than desired without subdividing +#define BUCKET_DIVISION_THRESHOLD 1 +#define NUM_BUCKETS static_cast(sizeof(BUCKET_TYPE) * 8) +#define CLZ(x) ((x) == 0 ? (NUM_BUCKETS) : __builtin_clz(x)) +#define CEIL_LOG2(x) (NUM_BUCKETS - CLZ(x)) // Ceiling of log base 2 of (x + 1) +#define LOG_FIRST_BUCKET_SIZE 12 +#define FIRST_BUCKET_SIZE (1 << LOG_FIRST_BUCKET_SIZE) // First bucket is for 4K bytes +#define GET_ALLOC_BUCKET(size) (CEIL_LOG2(((size) - 1) >> LOG_FIRST_BUCKET_SIZE)) +#define GET_DEALLOC_BUCKET(size) (CEIL_LOG2((size) >> (1 + LOG_FIRST_BUCKET_SIZE))) +#define GET_BUCKET_SIZE(b) (1UL << (LOG_FIRST_BUCKET_SIZE + b)) + +#define BUCKET_MASK(b) (1UL << (b)) +#define PREV_BUCKETS_MASK(b) (BUCKET_MASK(b) - 1) +#define AVAILABLE_NEXT_MASK(b, buckets) ((buckets) & ~PREV_BUCKETS_MASK(b)) + +/* + * Returns the "best-matching" available bucket as defined by policy. + * The two policies are: + * + * TAKE_FROM_BIGGEST = true: If a bucket in the range + * b...{b + BUCKET_DIVISION_THRESHOLD} is available, return the smallest + * available bucket in that range. Otherwise return the *biggest* available + * bucket greater than or equal to b. + * + * TAKE_FROM_BIGGEST = false: Return the *smallest* available bucket greater + * than or equal to b. + * + * Returns -1 when no satisfactory bucket is available. + */ +#define TAKE_FROM_BIGGEST true +#if TAKE_FROM_BIGGEST +#define GET_AVAILABLE_BUCKET(b, buckets) \ + (-1 + (((AVAILABLE_NEXT_MASK(b, buckets)) \ + & (PREV_BUCKETS_MASK((b) + 1 + BUCKET_DIVISION_THRESHOLD))) \ + /* Smallest bucket >= b */ ? __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) \ + /* Biggest bucket >= b */ : CEIL_LOG2(AVAILABLE_NEXT_MASK(b, buckets)))) +#else +#define GET_AVAILABLE_BUCKET(b, buckets) __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) +#endif + +/* + * Bit get/set/clear. 
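+ *
+ * Worked example for the bucket macros defined above (editorial illustration,
+ * not part of the original source): a request for 10000 bytes gives
+ * GET_ALLOC_BUCKET(10000) = CEIL_LOG2((10000 - 1) >> 12) = CEIL_LOG2(2) = 2,
+ * and GET_BUCKET_SIZE(2) = 1 << (12 + 2) = 16384 bytes, the smallest bucket
+ * size that can hold the request; a 4096-byte request maps to bucket 0 (4 KB).
+ * With TAKE_FROM_BIGGEST and BUCKET_DIVISION_THRESHOLD = 1, such a request is
+ * served from bucket 2 or 3 if either is free, and otherwise from the largest
+ * free bucket, which malloc() then subdivides.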
+ */ +#define GET_BIT(x, bit) ((x) & (1 << (bit))) +#define SET_BIT(x, bit) ((x) |= (1 << (bit))) +#define CLEAR_BIT(x, bit) ((x) &= ~(1 << (bit))) + +typedef struct __align__(512) { + char data; +} DataType; + +#define SIZE_ROUNDUP(size) (sizeof(DataType) * DIVUP((size), sizeof(DataType))) + +class MemorySegment { + friend class FastMemoryManager; +protected: + DataType* _data; + size_t _size; + int _deviceID; + // Resizes itself to _size - size and + // returns pointer to new memory segment + MemorySegment* subdivide(size_t size) { + assert(size < _size); +// assert(size % sizeof(DataType) == 0); + _size -= size; + return new MemorySegment(_data + _size / sizeof(DataType), size, _deviceID); + } + + inline size_t getSize() const { + return _size; + } +public: + MemorySegment(DataType* data, size_t size, int deviceID) : _data(data), _size(size), _deviceID(deviceID) { + assert(size % sizeof(DataType) == 0); + } + // In some cases size is irrelevant + template MemorySegment(T* data) : _data(reinterpret_cast(data)), _size(0), _deviceID(-1) { + + } + + template + inline T* getData() const { + return reinterpret_cast(_data); + } + + template + inline T** getDataPtr() { + return reinterpret_cast(&_data); + } + + inline int getDeviceID() const { + return _deviceID; + } +}; + +class MemoryManager { +protected: + static Lock _globalLock; +public: + virtual MemoryManager* init() = 0; + virtual MemorySegment* malloc(size_t size) = 0; + virtual void free(MemorySegment* mem) = 0; + virtual ~MemoryManager() { + + } +}; + +class FastMemoryManager : public MemoryManager { +protected: + int _deviceID; + Lock _lock; + DataType* _data; + size_t _size; + BUCKET_TYPE _buckets; // Bucket availability bit vector + std::vector > _freeSegments; // bucket idx -> vector of segments + + static std::map _memoryManagers; + + virtual void allocateInitialSegment() { + assert(_deviceID >= 0); + assert(FIRST_BUCKET_SIZE % sizeof(DataType) == 0); + checkCudaErrors(cudaSetDevice(_deviceID)); + size_t memFree, memTotal; + checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal)); + _size = sizeof(DataType) * (size_t(round(double(memFree) * GPU_ALLOC_FRACTION)) / sizeof(DataType)); + printf("FastMemoryManager[%d] allocating %lu-byte initial segment\n", _deviceID, _size); + checkCudaErrors(cudaMalloc(&_data, _size)); + } + + virtual void freeInitialSegment() { + checkCudaErrors(cudaFree(_data)); + } + +public: + static MemoryManager& getInstance(int deviceID); + static void destroyInstance(int deviceID); + + FastMemoryManager(int deviceID) : _deviceID(deviceID), _data(NULL), _size(0), _buckets(0) { + } + + ~FastMemoryManager() { + freeInitialSegment(); + for (int i = 0; i < _freeSegments.size(); ++i) { + for (int j = 0; j < _freeSegments[i].size(); ++j) { + delete _freeSegments[i][j]; + } + } + } + + virtual MemoryManager* init() { + allocateInitialSegment(); + + for (int i = 0; i < NUM_BUCKETS; ++i) { + _freeSegments.push_back(std::vector()); + } + int bucket = GET_DEALLOC_BUCKET(_size); + SET_BIT(_buckets, bucket); + _freeSegments[bucket].push_back(new MemorySegment(_data, _size, _deviceID)); + return this; + } + + MemorySegment* malloc(size_t size) { + assert(size > 0); + int requestedBucket = GET_ALLOC_BUCKET(size); + _lock.acquire(); + + int bucket = GET_AVAILABLE_BUCKET(requestedBucket, _buckets); +// if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { +// printf("MemoryManager[%d] requested size: %lu, requested bucket: %d, available bucket: %d\n", _deviceID, size, requestedBucket, bucket); +// } + + assert(bucket >= 
requestedBucket); // Out of memory + + MemorySegment* sourceSegment = _freeSegments[bucket].back(); + MemorySegment* ret = sourceSegment; + if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { // We got a much bigger chunk than we wanted + ret = sourceSegment->subdivide(GET_BUCKET_SIZE(requestedBucket)); + int newSrcBucket = GET_DEALLOC_BUCKET(sourceSegment->getSize()); + if (newSrcBucket != bucket) { + _freeSegments[bucket].pop_back(); + _freeSegments[newSrcBucket].push_back(sourceSegment); + SET_BIT(_buckets, newSrcBucket); + } + } else { + _freeSegments[bucket].pop_back(); + } + if (_freeSegments[bucket].size() == 0) { + CLEAR_BIT(_buckets, bucket); + } + _lock.release(); + return ret; + } + + void free(MemorySegment* mem) { + assert(mem != NULL); + assert(mem->getSize() >= FIRST_BUCKET_SIZE); + int bucket = GET_DEALLOC_BUCKET(mem->getSize()); + // Synchronize for safety, so that we don't free memory that's being used. Not synchronizing + // could potentially cause a problem if we re-allocate the just-freed chunk and attempt to + // use it in a different stream. + if (SYNC_ON_FREE) { + int d; + checkCudaErrors(cudaGetDevice(&d)); + checkCudaErrors(cudaSetDevice(mem->getDeviceID())); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaSetDevice(d)); + } + _lock.acquire(); + _freeSegments[bucket].push_back(mem); + SET_BIT(_buckets, bucket); +// printf("MemoryManager[%d] Freed segment of size %lu into bucket %lu\n", _deviceID, mem->getSize(), bucket); + _lock.release(); + } +}; + +class FastHostMemoryManager : public FastMemoryManager { +protected: + static MemoryManager* _memoryManager; + void allocateInitialSegment() { + _size = HOST_ALLOC_CHUNK; + checkCudaErrors(cudaHostAlloc(&_data, _size, cudaHostAllocPortable)); + } + void freeInitialSegment () { + checkCudaErrors(cudaFreeHost(_data)); + } +public: + FastHostMemoryManager() : FastMemoryManager(DEVICE_HOST) { + } + + static MemoryManager& getInstance(); + static void destroyInstance(); +}; + +class CUDAMemoryManager : public MemoryManager { +protected: + static MemoryManager* _memoryManager; + + virtual void _malloc(DataType** data, size_t size) { + checkCudaErrors(cudaMalloc(data, size)); + } + virtual void _free(MemorySegment* mem) { + checkCudaErrors(cudaFree(mem->getData())); + } +public: + static MemoryManager& getInstance(int deviceID); + static void destroyInstance(int deviceID); + CUDAMemoryManager() { + } + + MemoryManager* init() { + return this; + } + + MemorySegment* malloc(size_t size) { + MemorySegment* seg = new MemorySegment(reinterpret_cast(NULL)); + DataType** data = seg->getDataPtr(); + _malloc(data, size); + return seg; + } + + void free(MemorySegment* mem) { + assert(mem != NULL); + _free(mem); + delete mem; + } +}; + +class CUDAHostMemoryManager : public CUDAMemoryManager { +protected: + static MemoryManager* _memoryManager; + void _free(MemorySegment* mem) { + checkCudaErrors(cudaFreeHost(mem->getData())); + } + void _malloc(DataType** data, size_t size) { + checkCudaErrors(cudaHostAlloc(data, size, cudaHostAllocPortable)); + } +public: + static MemoryManager& getInstance(); + static void destroyInstance(); + CUDAHostMemoryManager() : CUDAMemoryManager() { + + } +}; +#endif /* MEMORY_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh new file mode 100644 index 0000000..d878d74 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh @@ -0,0 +1,667 @@ +/* + * Copyright 2014 
Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_H_ +#define NVMATRIX_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../../util/include/matrix.h" +#include "nvmatrix_kernels.cuh" +#include "nvmatrix_operators.cuh" +#include "memory.cuh" + +#ifdef WARNINGS +#define WARN(msg) printf("WARN: File %s, line %d: %s\n", __FILE__, __LINE__, msg); +#else +#define WARN(msg) ; +#endif + +#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \ + printf("CURAND Error at %s:%d\n",__FILE__,__LINE__);\ + exit(EXIT_FAILURE);}} while(0) + +#define CUBLAS_CALL(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \ + printf("CUBLAS Error at %s:%d\n",__FILE__,__LINE__);\ + exit(EXIT_FAILURE);}} while(0) + +/* + * Memory manager to use for GPU memory allocations. + * + * CUDAMemoryManager: Default Nvidia memory manager; just calls cudaMalloc / cudaFree. + * Allocating and freeing memory is slow. + * FastMemoryManager: A GPU memory manager with very fast (constant time) + * alloc / free, but possibly more wasteful of memory. + */ +#define DEVICE_MEMORY_MANAGER CUDAMemoryManager + +/* + * Memory manager to use for host memory allocations. + * + * CUDAHostMemoryManager: Default Nvidia memory manager; just calls cudaHostAlloc / cudaFreeHost. + * Allocating and freeing memory is slow. + * FastHostMemoryManager: A host memory manager with very fast (constant time) + * alloc / free, but possibly more wasteful of memory. + */ +#define HOST_MEMORY_MANAGER CUDAHostMemoryManager + +class NVMatrix; +typedef std::vector NVMatrixV; + +class NVMatrix { +protected: + int _numCols, _numRows; + int _numElements; + int _stride; +// float* getDevData(); + MemorySegment* _memSegment; + bool _isTrans; + bool _ownsData; + // This flag makes sure that the NVMatrix destructor does nothing + // when called on HostNVMatrix instance. + bool _deleted; + cudaTextureObject_t _texObj; + +// static std::map rndGen; + static std::map _rndDevStates; + static std::map _cublasHandles; + // Map from device id --> # of random streams initialized on that device + static std::map _rndDevThreads; + static pthread_mutex_t *_rndMutex, *_cublasMutex, *_streamMutex; + // Map from device id --> default stream + static std::map _defaultStreams; + + cublasOperation_t getTransChar() const { + /* + * not a typo! return opposite character because a + * non-transposed nvmatrix is in row-major order while a non-transposed + * cublas matrix is in column-major order. + */ + return _isTrans ? 
CUBLAS_OP_N : CUBLAS_OP_T; + } + + void _init(bool isTrans); + void _sum_setParams(int n, dim3* blocks, dim3* threads); + template float cpuAgg(Agg agg, cudaStream_t stream); + template float _totalAgg(Agg agg); + template float _totalAgg(Agg agg, cudaStream_t stream); + template float _totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop); + + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, NVMatrix& tmp); + + template void _unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream); + template void _unaryRandomize(NVMatrix& target, Randomizer rnd); + template void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd); + template void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream); + + virtual void alloc(int numElements); + virtual void dealloc(); + void deallocTexture(); + virtual NVMatrix& construct() const; + virtual NVMatrix& construct(bool isTrans) const; + virtual NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const; + virtual NVMatrix& construct(const Matrix& like, bool copy) const; + virtual NVMatrix& construct(const NVMatrix& like, bool copy) const; + virtual NVMatrix& construct(const NVMatrix& like) const; + virtual NVMatrix& construct(const Matrix& like) const; + virtual NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const; + static cublasHandle_t getCublasHandle(); + static cublasHandle_t getCublasHandle(int deviceID); +public: + NVMatrix(); + NVMatrix(bool isTrans); + NVMatrix(int numRows, int numCols, bool isTrans=false); + NVMatrix(const Matrix& like, bool copy); + NVMatrix(const NVMatrix& like, bool copy); + NVMatrix(const NVMatrix& like); + NVMatrix(const Matrix& like); + NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans); + virtual ~NVMatrix(); + + // Returns the device ID on which the data pointer is allocated + int getDataDeviceID() const; + static void initRandom(unsigned long long seed, int numStreams, cudaStream_t stream); + static void initRandom(unsigned long 
long seed, int numStreams); + static void initRandom(unsigned long long seed); + static void initRandom(); + static void initCublas(); + static void destroyCublas(); + static std::pair getCudaMemorySize(); + + // Returns the currently-active device ID for calling thread + static int getDeviceID(); + static void setDeviceID(int d); + static bool canAccessPeer(int srcDevice, int tgtDevice); + static bool isRndInitialized(); + static bool isRndInitialized(bool haveLock); + static curandState* getCurandState(); + static curandState* getCurandState(int numStreams); + static void destroyRandom(); + static pthread_mutex_t* makeMutex(); + static cudaStream_t getDefaultStream(int deviceID); + static cudaStream_t getDefaultStream(); + static void syncDevice(); + static void syncStream(); + static void syncStream(cudaStream_t stream); + + /* + * DO NOT DEREFERENCE IN HOST CODE! This is a device memory pointer. + */ + float* getCellPtr(int i, int j) const { + if (_isTrans) { + return &getDevData()[j * _numRows + i]; + } + return &getDevData()[i * _numCols + j]; + } + + bool isSameDims(const Matrix& m) const { + return m.getNumRows() == _numRows && m.getNumCols() == _numCols; + } + + bool isSameDims(const NVMatrix& m) const { + return m.getNumRows() == _numRows && m.getNumCols() == _numCols; + } + + int getNumRows() const { + return _numRows; + } + + int getNumCols() const { + return _numCols; + } + + int getStride() const { + return _stride; + } + + int getLeadingDim() const { + return _isTrans ? _numRows : _numCols; + } + + int getFollowingDim() const { + return !_isTrans ? _numRows : _numCols; + } + + /* + * FALSE: Row-major order. + * TRUE: Column-major order. + */ + bool isTrans() const { + return _isTrans; + } + + bool isView() const { + return !_ownsData; + } + + float* getDevData() const { + return _memSegment == NULL ? NULL : _memSegment->getData(); + } + + MemorySegment& getMemorySegment() const { + return *_memSegment; + } + + int getNumElements() const { + return _numElements; + } + + size_t getNumDataBytes() const { + return size_t(_numElements) * 4; + } + + /* + * Only use if you know what you're doing! + * Does not actually transpose matrix. + */ + void setTrans(bool trans) { + if (trans != _isTrans) { + assert(isContiguous()); + _isTrans = trans; + _stride = getLeadingDim(); + } + } + + /* + * Only use if you know what you're doing! + * This toggles whether this object will free its GPU memory when it's destroyed. 
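+ *
+ * (Editorial note, illustrative) When several NVMatrix objects alias the same
+ * MemorySegment, every aliasing instance except one should be marked with
+ * setIsView(true), so that exactly one destructor returns the segment to the
+ * memory manager and the others leave it untouched.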
+ */ + void setIsView(bool isView) { + _ownsData = !isView; + } + + bool isContiguous() const { + return _stride == getLeadingDim() || getFollowingDim() == 1; + } + + void truncate() { + resize(0,0); + } + + virtual cudaTextureObject_t getTextureObject(); + + virtual void copyFromHost(const Matrix& hostMatrix); + virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget); + virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream); + virtual void copyToHost(Matrix& hostMatrix) const; + virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget) const; + virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const; + void copy(NVMatrix& dest) const; + void copy(NVMatrix& dest, cudaStream_t stream) const; + NVMatrix& copy() const; + void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream); + void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB); + void addProduct(NVMatrix& a, NVMatrix &b); + void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream); + void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target); + void rightMult(NVMatrix &b, NVMatrix &target); + void rightMult(NVMatrix &b, float scaleAB); + void randomizeUniform(); + void addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target); + void addGaussianNoise(float stdev, NVMatrix& target); + void addGaussianNoise(NVMatrix& stdevs, bool var); + void addGaussianNoise(NVMatrix& stdevs); + void addGaussianNoise(float stdev); + void addGaussianNoise(); + void randomizeGaussian(); + void randomizeGaussian(float stdev); + void randomizeGaussian(float mean, float stdev); + void randomizeGaussian(float mean, NVMatrix& stdevs); + void randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs); + void randomizeGaussian(NVMatrix& stdevs); + void randomizeGaussian(NVMatrix& stdevs, NVMatrix& target); + void binarizeProbs(); + void binarizeProbs(NVMatrix& target); + + void biggerThan(NVMatrix& m, NVMatrix& target); + void biggerThan(NVMatrix& m); + void biggerThanVector(NVMatrix& vec, NVMatrix& target); + void biggerThanVector(NVMatrix& vec); + void equals(NVMatrix& m, NVMatrix& target); + void equals(NVMatrix& m); + + void _checkBounds(int startRow, int endRow, int startCol, int endCol) const; + NVMatrix& slice(int startRow, int endRow, int startCol, int endCol) const; + void slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const; + NVMatrix& sliceRows(int startRow, int endRow) const; + void sliceRows(int startRow, int endRow, NVMatrix& target) const; + NVMatrix& sliceCols(int startCol, int endCol) const; + void sliceCols(int startCol, int endCol, NVMatrix& target) const; + + NVMatrixV& splitRows(int numParts); + NVMatrixV& splitCols(int numParts); + + template void apply(Op op, NVMatrix& target, cudaStream_t stream) { + if (!target.isSameDims(*this)) { + target.resize(*this); + } + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + + if (target.isTrans() == isTrans()) { + if (!isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)), + std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseUnaryOp<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + getLastCudaError("kEltwiseUnaryOp: Kernel execution failed"); + } else { + dim3 
threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseUnaryOpFlat<<>>(getDevData(), target.getDevData(), _numElements, op); + getLastCudaError("kEltwiseUnaryOpFlat: Kernel execution failed"); + } + } else { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)), + std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0); + // printf("height: %d, width: %d, stride: %d, target stride: %d, check bounds: %d, threads.x: %d, threads.y: %d, blocks.x: %d, blocks.y: %d\n", + // height, width, getStride(), target.getStride(), checkBounds, threads.x, threads.y, blocks.x, blocks.y); + if (checkBounds) { + kEltwiseUnaryOpTrans<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + } else { + kEltwiseUnaryOpTrans<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + } + getLastCudaError("kEltwiseUnaryOpTrans: Kernel execution failed"); + } + } + } + + template void apply(Op op, cudaStream_t stream) { + apply(op, *this, stream); + } + + template void apply(Op op, NVMatrix& target) { + apply(op, target, getDefaultStream()); + } + + template void apply(Op op) { + apply(op, *this); + } + + template void applyBinary(Op op, NVMatrix& b) { + applyBinary(op, b, *this); + } + + template void applyBinary(Op op, NVMatrix& b, NVMatrix& target) { + applyBinary(op, b, target, getDefaultStream()); + } + + template void applyBinary(Op op, NVMatrix& b, NVMatrix& target, cudaStream_t stream) { + assert(this->isSameDims(b)); + + if (!target.isSameDims(*this)) { + target.resize(*this); + } + + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + if (target.isTrans() == isTrans() && target.isTrans() == b.isTrans()) { + if (!isContiguous() || !b.isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)), + std::min(128, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseBinaryOp<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), + b.getStride(), target.getStride(), op); + } else { + dim3 threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseBinaryOpFlat<<>>(getDevData(), b.getDevData(), target.getDevData(), _numElements, op); + } + getLastCudaError("kEltwiseBinaryOp: Kernel execution failed"); + } else { + + dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)), + std::min(128, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + // both x here since y divides x + bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0); + if (target.isTrans() == isTrans() && target.isTrans() != b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } + } else if (target.isTrans() != isTrans() && target.isTrans() != b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), 
target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } + } else if (target.isTrans() != isTrans() && target.isTrans() == b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(b.getDevData(), getDevData(), target.getDevData(), height, width,b.getStride(), + getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(b.getDevData(), getDevData(), target.getDevData(), height, width, b.getStride(), + getStride(), target.getStride(), op); + } + } + getLastCudaError("kEltwiseBinaryOpTrans: Kernel execution failed"); + } + } + } + + template void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target) { + applyTernary(op, b, c, target, getDefaultStream()); + } + + template void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target, cudaStream_t stream) { + assert(isSameDims(b)); + assert(isSameDims(c)); + // For now ternary ops are only supported for matrices of same transposedness + assert(isTrans() == b.isTrans()); + assert(isTrans() == c.isTrans()); + if (!target.isSameDims(*this) || target.isTrans() != isTrans()) { + target.resize(*this); + } + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + if (!isContiguous() || !b.isContiguous() || !c.isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(512, DIVUP(width, ELTWISE_THREADS_X)), + std::min(512, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseTernaryOp<<>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), height, width, + getStride(), b.getStride(), c.getStride(), target.getStride(), op); + getLastCudaError("kEltwiseTernaryOp: Kernel execution failed"); + } else { + dim3 threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseTernaryOpFlat<<>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), _numElements, op); + getLastCudaError("kEltwiseTernaryOpFlat: Kernel execution failed"); + } + } + } + + bool resize(int numRows, int numCols, bool trans); + bool resize(int numRows, int numCols); + bool resize(const NVMatrix &like); + bool resize(const Matrix &like); + void reshape(int numRows, int numCols); + NVMatrix& reshaped(int numRows, int numCols) const; + void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol) const; + void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol, cudaStream_t stream) const; + void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream); + void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target); + void add(NVMatrix& b, float scaleB, NVMatrix& target); + void add(NVMatrix& b, NVMatrix& target); + void add(NVMatrix& b, float scaleB); + void add(NVMatrix& b, float scaleA, float scaleB); + void add(NVMatrix& b); + void eltwiseMult(NVMatrix& b); + void eltwiseMult(NVMatrix& b, NVMatrix& target); + void eltwiseDivide(NVMatrix& b); + void eltwiseDivide(NVMatrix& b, NVMatrix& target); + void squaredDiff(NVMatrix& b); + void squaredDiff(NVMatrix& b, NVMatrix& target); + void subtract(NVMatrix& b, NVMatrix& target); + void subtract(NVMatrix& b); + void addVector(NVMatrix& vec, float 
scaleVec, NVMatrix& target, cudaStream_t stream); + void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target); + void addVector(NVMatrix& vec); + void addVector(NVMatrix& vec, float scaleVec); + void addVector(NVMatrix& vec, NVMatrix& target); + void equalsVector(NVMatrix& vec, NVMatrix& target); + void equalsVector(NVMatrix& vec); + void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream); + void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target); + void eltwiseMultByVector(NVMatrix& vec); + void eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream); + void eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target); + void eltwiseDivideByVector(NVMatrix& vec); + void tile(int timesY, int timesX, NVMatrix& target); + void tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream); + + void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum); + void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream); + void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax); + void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream); + void sum(int axis, NVMatrix& target, cudaStream_t stream); + void sum(int axis, NVMatrix& target); + void sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp); + void sum(int axis, NVMatrix& target, NVMatrix& tmp); + NVMatrix& sum(int axis); + void max(int axis, NVMatrix& target); + void max(int axis, NVMatrix& target, NVMatrix& tmp); + NVMatrix& max(int axis); + void min(int axis, NVMatrix& target); + NVMatrix& min(int axis); + void sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream); + void sumOfSquares(int axis, NVMatrix& target); + NVMatrix& sumOfSquares(int axis); + float mean(); + float sum(); + float sum(NVMatrix& tmpbuf); + float max(); + float min(); + float countInf(); + float countNan(); + float norm2(); + float norm(); + + void inRangeInc(float lower, float upper); + void inRangeInc(float lower, float upper, NVMatrix& target); + void inRangeExc(float lower, float upper); + void inRangeExc(float lower, float upper, NVMatrix& target); + void biggerThanScalar(float scalar); + void biggerThanScalar(float scalar, NVMatrix& target); + void smallerThanScalar(float scalar); + void smallerThanScalar(float scalar, NVMatrix& target); + void addScalar(float scaleThis, float scalar, NVMatrix& target); + void addScalar(float scalar, NVMatrix& target); + void addScalar(float scalar); + void minWithScalar(float scalar, NVMatrix& target); + void minWithScalar(float scalar); + void maxWithScalar(float scalar, NVMatrix& target); + void maxWithScalar(float scalar); + void pow(float p, NVMatrix& target); + void pow(float p); + void scale(float _scale); + void scale(float _scale, NVMatrix& target); + void scale(float _scale, NVMatrix& target, cudaStream_t stream); + void scale(float _scale, cudaStream_t stream); + void zero(); + void zero(NVMatrix& like); + + float dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream); + float dotProduct(NVMatrix& b, cudaStream_t stream); + float dotProduct(NVMatrix& b); + + /* + * Does SOFT transpose and returns result, leaving this matrix unchanged + */ + NVMatrix& getTranspose(); + NVMatrix& getClone(); + + /* + * Does HARD transpose and puts result in target + */ + void transpose(NVMatrix& target); + + /* + * Does SOFT transpose + */ + void transpose(); + bool transpose(bool trans); + + void flipTrans(NVMatrix& target, cudaStream_t stream); + void flipTrans(NVMatrix& target); + NVMatrix& 
flipTrans(); + + void print(int startRow, int rows, int startCol, int cols) const; + void print(int rows, int cols) const; + void printShape(const char* name) const; + + template void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target) { + applyBinaryV(op, vec, target, getDefaultStream()); + } + + template void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target, cudaStream_t stream) { + assert(&target != &vec); // for now + if (isSameDims(vec)) { + applyBinary(op, vec, target, stream); + return; + } + assert(vec.getNumRows() == 1 || vec.getNumCols() == 1); + assert(vec.getNumRows() == _numRows || vec.getNumCols() == _numCols); + assert(vec.isContiguous()); + + target.resize(*this); // target must be same orientation as me for now + int width = getLeadingDim(); //_isTrans ? _numRows : _numCols; + int height = getFollowingDim(); //_isTrans ? _numCols : _numRows; + dim3 threads(ADD_VEC_THREADS_X, ADD_VEC_THREADS_Y); + + if ((vec.getNumRows() == _numRows && !isTrans()) || (vec.getNumCols() == _numCols && isTrans())) { + dim3 blocks(std::min(512, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y))); + kColVectorOp<<>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op); + } else { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y))); + kRowVectorOp<<>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op); + } + getLastCudaError("Kernel execution failed"); + // cudaThreadSynchronize(); + } + + template float argMax(UnaryOperator u) { + return _totalAgg(NVMatrixAggs::ArgMax(u)); + } + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB); + + static void assertSame(NVMatrixV& a); +}; + +class HostNVMatrix : public NVMatrix { +protected: + void alloc(int numElements); + void dealloc(); + NVMatrix& construct() const; + NVMatrix& construct(bool isTrans) const; + NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const; + NVMatrix& construct(const Matrix& like, bool copy) const; + NVMatrix& construct(const NVMatrix& like, bool copy) const; + NVMatrix& construct(const NVMatrix& like) const; + NVMatrix& construct(const Matrix& like) const; + NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const; +public: + ~HostNVMatrix(); + HostNVMatrix(); + HostNVMatrix(bool isTrans); + HostNVMatrix(int numRows, int numCols, bool isTrans=false); + HostNVMatrix(const Matrix& like, bool copy); + HostNVMatrix(const NVMatrix& like, bool copy); + HostNVMatrix(const NVMatrix& like); + HostNVMatrix(const Matrix& like); + HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans); + void copyFromHost(const Matrix& hostMatrix); + void copyFromHost(const Matrix& hostMatrix, bool resizeTarget); + void copyFromHost(const Matrix& hostMatrix, 
bool resizeTarget, cudaStream_t stream); + void copyToHost(Matrix& hostMatrix) const; + void copyToHost(Matrix& hostMatrix, bool resizeTarget) const; + void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const; + cudaTextureObject_t getTextureObject(); +}; + +#endif /* NVMATRIX_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh new file mode 100644 index 0000000..99b234a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh @@ -0,0 +1,727 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_KERNEL_H_ +#define NVMATRIX_KERNEL_H_ + +#include + +#if defined(_WIN64) || defined(_WIN32) +#define uint unsigned int +#endif + +#define NUM_BLOCKS_MAX 65535 +#define TEXTURE_SIZE_MAX (1<<29) + +#define NUM_RND_BLOCKS 96 +#define NUM_RND_THREADS_PER_BLOCK 128 +#define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK) + +/* + * Default grid/block sizes for the various functions. + */ +#define ADD_BLOCK_SIZE 16 + +#define NUM_TILE_BLOCKS 4096 +#define NUM_TILE_THREADS_PER_BLOCK 512 + +#define ELTWISE_THREADS_X 32 +#define ELTWISE_THREADS_Y 8 + +#define ELTWISE_FLAT_THREADS_X 128 + +#define NUM_SUM_COLS_THREADS_PER_BLOCK 128 + +#define AGG_SHORT_ROWS_THREADS_X 32 +#define AGG_SHORT_ROWS_THREADS_Y 8 +#define AGG_SHORT_ROWS_LOOPS_Y 32 + +#define DP_BLOCKSIZE 512 +#define CPUSUM_MAX 4096 + +#define ADD_VEC_THREADS_X 64 +#define ADD_VEC_THREADS_Y 4 + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define MYMAX(a, b) ((a) > (b) ? (a) : (b)) + +#ifndef MUL24 // legacy +#define MUL24(x,y) ((x) * (y)) +#endif + +#define AWR_NUM_THREADS 256 +#define WARP_SIZE 32 +#define AWR_NUM_WARPS AWR_NUM_THREADS / WARP_SIZE +#define AWR_LOG_NUM_THREADS 8 +#define LOG_WARP_SIZE 5 +#define AWR_LOG_NUM_WARPS 3 + +#define DEVICE_HOST -1 +#define DEVICE_NULL -2 + +__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight); +__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements); +__global__ void kSetupCurand(curandState *state, unsigned long long seed); + +template +__device__ T shfl_down(T a, int b, int c=WARP_SIZE) { +#if __CUDA_ARCH__ >= 300 + return __shfl_down(a, b, c); +#else + return 0; +#endif +} + +/* + * For now this is supported only for arrays with the same transposedness. 
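+ *
+ * Illustrative usage (editorial addition; MultAdd is a hypothetical functor,
+ * not part of the original source):
+ *
+ *   struct MultAdd {
+ *       __device__ inline float operator()(float a, float b, float c) const {
+ *           return a * b + c;   // dest = a * b + c, elementwise
+ *       }
+ *   };
+ *
+ * NVMatrix::applyTernary(MultAdd(), b, c, target) dispatches to this kernel
+ * when any operand is non-contiguous, and to kEltwiseTernaryOpFlat otherwise.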
+ */ +template +__global__ void kEltwiseTernaryOp(const float* a, const float* b, const float* c, float* const dest, + const uint height, const uint width, uint strideA, const uint strideB, const uint strideC, + const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x], c[y * strideC + x]); + } + } +} + +template +__global__ void kEltwiseTernaryOpFlat(const float* a, const float* b, const float* c, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x], b[x], c[x]); + } +} + + +/* + * dest here is assumed to be "not transposed" -- height and width correspond to it. + * b is assumed to be transposed. + * a can be either transposed or not -- depending on parameter. + * + * Performs dest := op(a, b) + */ +template +__global__ void kEltwiseBinaryOpTrans(const float* a, const float* b, float* const dest, + const uint height, const uint width, + const uint strideA, const uint strideB, const uint strideDest, Op op) { + + __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1]; + + // x here because that's how much work we do + for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) { + for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) { + const uint readX = by + threadIdx.x; + const uint readY = bx + threadIdx.y; + + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if (!checkBounds || (readX < height && readY + y < width)) { + if (aTrans) { + shmem[threadIdx.x][threadIdx.y + y] = reverse ? op(b[(readY+y) * strideB + readX], a[(readY+y) * strideA + readX]) + : op(a[(readY+y) * strideA + readX], b[(readY+y) * strideB + readX]); + } else { + shmem[threadIdx.x][threadIdx.y + y] = b[(readY+y) * strideB + readX]; + } + } + } + __syncthreads(); + + const uint writeX = bx + threadIdx.x; + const uint writeY = by + threadIdx.y; + + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if(!checkBounds || (writeX < width && writeY + y < height)) { + if (aTrans) { + dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x]; + } else { + dest[(writeY + y) * strideDest + writeX] = reverse ? 
op(shmem[threadIdx.y + y][threadIdx.x], a[(writeY + y) * strideA + writeX]) + : op(a[(writeY + y) * strideA + writeX], shmem[threadIdx.y + y][threadIdx.x]); + } + } + } + __syncthreads(); + } + } +} +template +__global__ void kEltwiseBinaryOp(const float* a, const float* b, float* const dest, const uint height, const uint width, + const uint strideA, const uint strideB, const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x]); + } + } +} + +template +__global__ void kEltwiseBinaryOpFlat(const float* a, const float* b, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x], b[x]); + } +} + +/* + * dest here is assumed to be "not transposed" -- height and width correspond to it. + */ +template +__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest, + const uint height, const uint width, + const uint strideA, const uint strideDest, Op op) { + + __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1]; + + for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) { + for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) { + const uint readX = by + threadIdx.x; + const uint readY = bx + threadIdx.y; + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if (!checkBounds || (readX < height && readY + y < width)) { + shmem[threadIdx.x][threadIdx.y + y] = op(a[(readY + y) * strideA + readX]); + } + } + __syncthreads(); + + const uint writeX = bx + threadIdx.x; + const uint writeY = by + threadIdx.y; + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if(!checkBounds || (writeX < width && writeY + y < height)) { + dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x]; + + } + } + __syncthreads(); + } + } +} + +template +__global__ void kEltwiseUnaryOpFlat(const float* a, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x]); + } +} + +template +__global__ void kEltwiseUnaryOp(const float* a, float* const dest, const uint height, const uint width, + const uint strideA, const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x]); + } + } +} + +/* + * Matrix in ROW-MAJOR order! 
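+ *
+ * (Editorial note) Computes tgt[y][x] = op(mat[y][x], vec[x]): the length-width
+ * vector is broadcast across every row of the row-major matrix, with one tile
+ * of the vector staged in shared memory per iteration of the outer loop.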
+ */ +template +__global__ void kRowVectorOp(const float* mat, const float* vec, float* const tgtMat, const uint width, const uint height, + const uint matStride, const uint tgtStride, Op op) { + __shared__ float shVec[ADD_VEC_THREADS_X]; + const uint bx = ADD_VEC_THREADS_X * blockIdx.x; + const uint by = ADD_VEC_THREADS_Y * blockIdx.y; + + for (uint x = bx; x < width; x += gridDim.x * ADD_VEC_THREADS_X) { + __syncthreads(); + if (x + threadIdx.x < width && threadIdx.y == 0) { + shVec[threadIdx.x] = vec[x + threadIdx.x]; + } + __syncthreads(); + + if (x + threadIdx.x < width) { + for (uint y = by + threadIdx.y; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) { + tgtMat[y * tgtStride + x + threadIdx.x] = op(mat[y * matStride + x + threadIdx.x], shVec[threadIdx.x]); + } + } + } +} + +/* + * Matrix in ROW-MAJOR order! + */ +template +__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat, + const uint width, const uint height, + const uint matStride, const uint tgtStride, Op op) { + __shared__ float shVec[ADD_VEC_THREADS_Y]; + const uint by = ADD_VEC_THREADS_Y * blockIdx.y; + const uint bx = ADD_VEC_THREADS_X * blockIdx.x; + const uint tidx = ADD_VEC_THREADS_X * threadIdx.y + threadIdx.x; + + mat += threadIdx.y * matStride; + vec += tidx; + tgtMat += threadIdx.y * tgtStride; + + for (uint y = by; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) { + __syncthreads(); + if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) { + shVec[tidx] = vec[y]; + } + __syncthreads(); + + if (y + threadIdx.y < height) { + for (uint x = bx + threadIdx.x; x < width; x += gridDim.x * ADD_VEC_THREADS_X) { + tgtMat[(y) * tgtStride + x] = op(mat[(y) * matStride + x], shVec[threadIdx.y]); + } + } + } +} + +/* + * This one gets coalesced reads but computes only a partial sum which + * must either be summed again (recursively) or summed on the host. + */ +template +__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) { + const int idxX = blockIdx.x * blockSize*2 + threadIdx.x; + + __shared__ float accum[blockSize*2]; + + matSum += blockIdx.y * sumWidth + blockIdx.x; + /* + * Here it's important to make sure that all threads in a block call __syncthreads, + * so I have even the redundant threads (for which idxX >= width) enter this loop + * just so that they may call __syncthreads at the appropriate times. 
+ */ + mat += width * blockIdx.y + idxX; + + accum[threadIdx.x] = agg.getBaseValue(); + accum[threadIdx.x + blockSize] = agg.getBaseValue(); + for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) { + if (idxX < width) { + accum[threadIdx.x] = uop(mat[0]); + if(idxX + blockSize < width) + accum[threadIdx.x + blockSize] = uop(mat[blockSize]); + } + if (blockSize >= 512) { + __syncthreads(); + if (threadIdx.x < 512) + accum[threadIdx.x] = agg(accum[threadIdx.x], accum[threadIdx.x + 512]); + } + if (blockSize >= 256) { + __syncthreads(); + if (threadIdx.x < 256) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 256]); + } + if (blockSize >= 128) { + __syncthreads(); + if (threadIdx.x < 128) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 128]); + } + if (blockSize >= 64) { + __syncthreads(); + if (threadIdx.x < 64) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 64]); + } + + __syncthreads(); + volatile float* myAccum = &accum[threadIdx.x]; + if (threadIdx.x < 32) { // executed only by first warp + myAccum[0] = agg(myAccum[0], myAccum[32]); + myAccum[0] = agg(myAccum[0], myAccum[16]); + myAccum[0] = agg(myAccum[0], myAccum[8]); + myAccum[0] = agg(myAccum[0], myAccum[4]); + myAccum[0] = agg(myAccum[0], myAccum[2]); + myAccum[0] = agg(myAccum[0], myAccum[1]); + } + + if (threadIdx.x == 0) { + matSum[0] = bop(matSum[0], myAccum[0]); + matSum += gridDim.y * sumWidth; + } + __syncthreads(); + mat += width * gridDim.y; + } +} + +template +__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) { + const int tidx = threadIdx.x; + + __shared__ float accum[AWR_NUM_THREADS]; + volatile float* vMyAccum = &accum[tidx]; + float* myAccum = &accum[tidx]; + + matSum += blockIdx.y; + mat += width * blockIdx.y; + + for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) { + myAccum[0] = agg.getBaseValue(); + for (uint x = tidx; x < width; x += AWR_NUM_THREADS) { + myAccum[0] = agg(myAccum[0], mat[x]); + } + #pragma unroll + for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) { + const uint d = 1 << i; + __syncthreads(); + if (tidx < d) { + myAccum[0] = agg(myAccum[0], myAccum[d]); + } + } + __syncthreads(); + if (tidx < WARP_SIZE) { + #pragma unroll + for (int i = LOG_WARP_SIZE; i >= 0; i--) { + const uint d = 1 << i; + vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]); + } + + if (tidx == 0) { + matSum[0] = op(matSum[0], vMyAccum[0]); + matSum += gridDim.y; + } + } + __syncthreads(); + mat += width * gridDim.y; + } +} + +/* + * Implements multiscan idea from http://www.moderngpu.com + * Not really useful for pure reductions but neat nonetheless. 
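+ *
+ * (Editorial note) Each warp reduces its running value with LOG_WARP_SIZE = 5
+ * shfl_down steps (offsets 1, 2, 4, 8, 16), which needs no synchronization
+ * inside the warp; a single __syncthreads() then publishes the per-warp
+ * results, and the first AWR_NUM_WARPS threads combine them with the same
+ * shuffle pattern before thread 0 folds the row total into matSum.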
+ */ +template +__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height, + Agg agg, UnaryOp uop, BinaryOp bop) { + const uint tidx = threadIdx.x; + const uint warpIdx = tidx / WARP_SIZE; + const uint lane = tidx % WARP_SIZE; + + __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS]; + __shared__ float finalAccum[AWR_NUM_WARPS]; + + float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane]; + float* myFinalAccum = &finalAccum[tidx]; + //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane]; + matSum += blockIdx.y; + mat += width * blockIdx.y; + + float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem + #pragma unroll 32 + for (uint x = tidx; x < width; x += AWR_NUM_THREADS) { + rAccum = agg(rAccum, uop(mat[x])); + } + myAccum[0] = rAccum; + + // Each warp does a reduction that doesn't require synchronizatoin + #pragma unroll + for (uint i = 0; i < LOG_WARP_SIZE; i++) { + const uint d = 1 << i; + myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d)); + } + __syncthreads(); + // The warps write their results + if (tidx < AWR_NUM_WARPS) { + //volatile float* vMyFinalAccum = &finalAccum[tidx]; + myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)]; + #pragma unroll + for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) { + const uint d = 1 << i; + myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d)); + } + if (tidx == 0) { + matSum[0] = bop(matSum[0], myFinalAccum[0]); + matSum += gridDim.y; + } + } +} + +/* + * To be used when the rows are <= 64. + * + * TODO: try to reduce reg usage. i think this can be made faster too. + */ +//#define AGG_SHORT_ROWS_LOOPS_X 4 +template +__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint shmemX = THREADS_X + 1; + __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX]; + + const uint tidx = threadIdx.y * THREADS_X + threadIdx.x; + const uint ty = LOOPS_X == 1 ? tidx / width : threadIdx.y; // when loops==1, width is gonna be smaller than block x dim + const uint tx = LOOPS_X == 1 ? tidx % width : threadIdx.x; + const uint bidx = blockIdx.y * gridDim.x + blockIdx.x; + const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; + float* shmemWrite = shmem + MUL24(ty, shmemX) + tx; + matSum += blockRowIdx + tidx; +// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0; + mat += width * blockRowIdx + MUL24(ty, width) + tx; + float* shmemWriteZeros = &shmem[MUL24(threadIdx.y,shmemX) + threadIdx.x]; + + bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y ; + + if (blockRowIdx < height) { +#pragma unroll + for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) { + doAgg &= tidx + y + blockRowIdx < height; + const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height; + + shmemWriteZeros[0] = agg.getBaseValue(); + __syncthreads(); +#pragma unroll + for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) { +// __syncthreads(); + if (heightIdxOK && x + tx < width) { + shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]); + } + } + __syncthreads(); + if (doAgg) { + /* + * I tried doing this final sum as a 4-step reduction, with 8 threads + * per warp participating. It was slightly slower. 
+ */ + float accum = agg.getBaseValue(); + float* shmemRead = shmem + MUL24(tidx, shmemX); + // this loops too much if the rows are really short :( +#pragma unroll + for (uint i = 0; i < THREADS_X; i++) { + accum = agg(accum, shmemRead[0]); + shmemRead++; + } + matSum[0] = bop(matSum[0], accum); + matSum += AGG_SHORT_ROWS_THREADS_Y; + } + __syncthreads(); + mat += width * AGG_SHORT_ROWS_THREADS_Y; + } + } +} + +template +__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1; + __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX]; + const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X); + const uint tidx = threadIdx.y * AGG_SHORT_ROWS_THREADS_X + threadIdx.x; + + const uint bidx = blockIdx.y * gridDim.x + blockIdx.x; + const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; + + float* shmemWrite = shmem + MUL24(threadIdx.y, shmemX) + threadIdx.x; + matSum += blockRowIdx + tidx; +// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0; + mat += width * blockRowIdx + MUL24(threadIdx.y, width) + threadIdx.x; + + bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y; + if(blockRowIdx < height) { + for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) { + doAgg &= tidx + y + blockRowIdx < height; + const bool heightIdxOK = threadIdx.y + y + blockRowIdx < height; + float accum = agg.getBaseValue(); + shmemWrite[0] = agg.getBaseValue(); + + for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) { +// __syncthreads(); + if (heightIdxOK && x + threadIdx.x < width) { + shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]); + } + } + + __syncthreads(); + if (doAgg) { + float* shmemRead = shmem + MUL24(tidx, shmemX); + +#pragma unroll + for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) { + accum = agg(accum, shmemRead[0]); + shmemRead++; + } + + matSum[0] = bop(matSum[0], accum); + matSum += AGG_SHORT_ROWS_THREADS_Y; + } + __syncthreads(); + mat += width * AGG_SHORT_ROWS_THREADS_Y; + } + } +} + +/* + * Bad when there are few columns. + */ +template +__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < width) { + float mx = agg.getBaseValue(); + for (uint j = 0; j < height; j++) { + mx = agg(uop(tex1Dfetch(mat, width * j + idx)), mx); + } + vec[idx] = bop(vec[idx], mx); + } +} + +/* + * Better with few columns because it only computes a partial sum. 
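+ *
+ * (Editorial note) Each blockIdx.y aggregates a sumLength-row slab of column
+ * idxX into vec[blockIdx.y * width + idxX], so the gridDim.y partial results
+ * per column still need one more reduction pass (or a host-side sum) to yield
+ * the final per-column aggregate.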
+ */ +template +__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) { + const uint idxX = blockIdx.x * blockDim.x + threadIdx.x; + const uint idxY = blockIdx.y * sumLength; + if (idxX < width) { + float mx = agg.getBaseValue(); + for (uint j = idxY; j < min(height,idxY + sumLength); j++) { + mx = agg(op(tex1Dfetch(mat, j * width + idxX)), mx); + } + vec[blockIdx.y * width + idxX] = mx; + } +} + +template +__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) { + __shared__ float shmem[DP_BLOCKSIZE]; + uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x; + shmem[threadIdx.x] = agg.getBaseValue(); + if (eidx < gridDim.x * DP_BLOCKSIZE) { + for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], a[eidx]); + } + } + __syncthreads(); + if (threadIdx.x < 256) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 256]); + } + __syncthreads(); + if (threadIdx.x < 128) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 128]); + } + __syncthreads(); + if (threadIdx.x < 64) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 64]); + } + __syncthreads(); + if (threadIdx.x < 32) { + volatile float* mysh = &shmem[threadIdx.x]; + *mysh = agg(*mysh, mysh[32]); + *mysh = agg(*mysh, mysh[16]); + *mysh = agg(*mysh, mysh[8]); + *mysh = agg(*mysh, mysh[4]); + *mysh = agg(*mysh, mysh[2]); + *mysh = agg(*mysh, mysh[1]); + if (threadIdx.x == 0) { + target[blockIdx.x] = *mysh; + } + } +} + +class AddGaussianUnaryRandomizer { +private: + const float stdev; +public: + AddGaussianUnaryRandomizer(float _stdev) : stdev(_stdev) { + } + __device__ inline float operator ()(float data, curandState* state) { + return data + stdev * curand_normal(state); + } +}; + +class BinarizeUnaryRandomizer { +public: + __device__ inline float operator ()(float data, curandState* state) { + return data > curand_uniform(state); + } +}; + +class UniformUnaryRandomizer { +public: + __device__ inline float operator ()(float data, curandState* state) { + return curand_uniform(state); + } +}; + +class GaussianUnaryRandomizer { +private: + const float mean, stdev; +public: + GaussianUnaryRandomizer(float _mean, float _stdev) : mean(_mean), stdev(_stdev) { + } + __device__ inline float operator ()(float data, curandState* state) { + return mean + stdev * curand_normal(state); + } +}; + +template +class AddGaussianBinaryRandomizer { +public: + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return data + (var ? 
stdev : 1) * stdev * curand_normal(state); + } +}; + +class GaussianBinaryRandomizer { +private: + const float mean; +public: + GaussianBinaryRandomizer(float _mean) : mean(_mean) { + } + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return mean + stdev * curand_normal(state); + } +}; + +class ScaledGaussianBinaryRandomizer { +private: + const float mean, stdevScale; +public: + ScaledGaussianBinaryRandomizer(float _mean, float _stdevScale) : mean(_mean), stdevScale(_stdevScale) { + } + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return mean + stdevScale * stdev * curand_normal(state); + } +}; + +template +__global__ void kUnaryRandomize(float* data, float* targets, curandState* state, const uint numElements, Randomizer rnd) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + curandState localState = state[tidx]; + + for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) { + targets[i] = rnd(data[i], &localState); + } + state[tidx] = localState; +} + +template +__global__ void kBinaryRandomize(float* data, float* data2, float* targets, curandState* state, const uint numElements, Randomizer rnd) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + curandState localState = state[tidx]; + + for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) { + targets[i] = rnd(data[i], data2[i], &localState); + } + state[tidx] = localState; +} + +#endif /* NVMATRIX_KERNEL_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh new file mode 100644 index 0000000..6c2a4c3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh @@ -0,0 +1,485 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_OPERATORS_CUH +#define NVMATRIX_OPERATORS_CUH + +class NVMatrixOps { +public: + class Exp { + public: + __device__ inline float operator()(const float a) const { + return __expf(a); + } + }; + + class Logistic { + public: + __device__ inline float operator()(const float a) const { + return __fdividef(1.0f, 1.0f + __expf(-a)); + } + }; + + class Log { + public: + __device__ inline float operator()(const float a) const { + return __logf(a); + } + }; + + class Square { + public: + __device__ inline float operator()(const float a) const { + return a * a; + } + }; + + class Sqrt { + public: + __device__ inline float operator()(const float a) const { + return sqrtf(a); + } + }; + + class SqrtAbs { + public: + __device__ inline float operator()(const float a) const { + return sqrtf(fabsf(a)); + } + }; + + class Reciprocal { + public: + __device__ inline float operator()(const float a) const { + return 1.0f / a; + } + }; + + class Abs { + public: + __device__ inline float operator()(const float a) const { + return a > 0 ? 
a : -a; + } + }; + + class Sign { + public: + __device__ inline float operator()(const float a) const { + return (a > 0) - (a < 0); + } + }; + + class Identity { + public: + __device__ inline float operator()(const float a) const { + return a; + } + }; + + class Zero { + public: + __device__ inline float operator()(const float a) const { + return 0; + } + }; + + class One { + public: + __device__ inline float operator()(const float a) const { + return 1; + } + }; + + class Const { + private: + const float scalar; + public: + Const(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return scalar; + } + }; + + class OneMinus { + public: + __device__ inline float operator()(const float x) const { + return 1.0f - x; + } + }; + + class Linear { + protected: + float _a, _b; + public: + __device__ inline float operator()(float x) const { + return _a * x + _b; + } + Linear(float a, float b) : _a(a), _b(b) { + } + }; + + class IsNan { + public: + __device__ inline float operator()(const float a) const { + return isnan(a); + } + }; + + class IsInf { + public: + __device__ inline float operator()(const float a) const { + return isinf(a); + } + }; + + class SmallerThanScalar { + private: + const float scalar; + public: + SmallerThanScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a < scalar; + } + }; + + class BiggerThanScalar { + private: + const float scalar; + public: + BiggerThanScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar; + } + }; + + class AddScalar { + private: + const float scalar; + public: + AddScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a + scalar; + } + }; + + class WeightedAddScalar { + private: + const float weight, scalar; + public: + WeightedAddScalar(const float _weight, const float _scalar) : weight(_weight), scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return weight * a + scalar; + } + }; + + class MultByScalar { + private: + const float scalar; + public: + MultByScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a * scalar; + } + }; + + class Pow { + private: + const float p; + public: + Pow(const float _p) : p(_p) { + } + __device__ inline float operator()(const float a) const { + return __powf(a, p); + } + }; + + template + class InRange { + private: + const float lower, upper; + public: + InRange(const float _lower, const float _upper) : lower(_lower), upper(_upper) { + } + __device__ inline float operator()(const float a) const { + return exclusive ? a > lower && a < upper : a >= lower && a <= upper; + } + }; + + class MinWithScalar { + private: + const float scalar; + public: + MinWithScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar ? scalar : a; + } + }; + + class MaxWithScalar { + private: + const float scalar; + public: + MaxWithScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar ? 
a : scalar; + } + }; +}; + +class NVMatrixBinaryOps { +public: + class BinaryOp { + public: + }; + class Equals : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a == b; + } + }; + + class BiggerThan : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b; + } + }; + + class Divide : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return __fdividef(a, b); + } + }; + + class DivideAccurate : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a / b; + } + }; + + class DivideSafe : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b == 0 ? 0 : __fdividef(a, b); + } + }; + + class DivideSafeAccurate : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b == 0 ? 0 : (a / b); + } + }; + + class Multiply : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a * b; + } + }; + + class SquaredDiff : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return (a - b) * (a - b); + } + }; + + class WeightedAdd : public BinaryOp { + private: + const float scaleA, scaleB; + public: + WeightedAdd(const float _scaleA, const float _scaleB) : scaleA(_scaleA), scaleB(_scaleB) { + } + WeightedAdd() : scaleA(0), scaleB(0) { // Compiler complains about no default constructor? + } + __device__ inline float operator()(const float a, const float b) const { + return a * scaleA + b * scaleB; + } + }; + + class WeightedAdd1 : public BinaryOp { + private: + const float scaleB; + public: + WeightedAdd1(const float _scaleB) : scaleB(_scaleB) { + } + __device__ inline float operator()(const float a, const float b) const { + return a + b * scaleB; + } + }; + + class ScaledAdd : public BinaryOp { + private: + const float scaleB; + public: + ScaledAdd(const float _scaleB) : scaleB(_scaleB) { + } + __device__ inline float operator()(const float a, const float b) const { + return a + b * scaleB; + } + }; + + class Add : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + }; + + class First : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a; + } + }; + + class Second : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b; + } + }; + + class SecondScaled : public BinaryOp { + private: + const float scale; + public: + SecondScaled(const float _scale) : scale(_scale) { + } + + SecondScaled() : scale(0) { // Compiler complains about no default constructor? 
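The operator classes above are plain functors with a __device__ operator(); the library passes them by value into templated element-wise kernels so that the per-element expression is inlined at compile time. A minimal sketch of that pattern, using a hypothetical applyUnary kernel rather than the library's own element-wise kernels:

```
#include <cstdio>
#include <cuda_runtime.h>

// Functor in the same style as NVMatrixOps::MultByScalar.
struct MultByScalar {
    float scalar;
    explicit MultByScalar(float s) : scalar(s) {}
    __device__ float operator()(float a) const { return a * scalar; }
};

// Hypothetical element-wise kernel: the functor is a template parameter,
// so operator() is inlined instead of going through a function pointer.
template <class Op>
__global__ void applyUnary(const float* in, float* out, int n, Op op) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = op(in[i]);
}

int main() {
    const int n = 8;
    float h[n] = {0, 1, 2, 3, 4, 5, 6, 7};
    float *dIn, *dOut;
    cudaMalloc(&dIn, n * sizeof(float));
    cudaMalloc(&dOut, n * sizeof(float));
    cudaMemcpy(dIn, h, n * sizeof(float), cudaMemcpyHostToDevice);
    applyUnary<<<1, 32>>>(dIn, dOut, n, MultByScalar(2.0f));
    cudaMemcpy(h, dOut, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) printf("%.1f ", h[i]);   // 0.0 2.0 4.0 ... 14.0
    printf("\n");
    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}
```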
+ } + __device__ inline float operator()(const float a, const float b) const { + return scale * b; + } + }; + + template + class CompositeSecond : public BinaryOp { + private: + UnaryOp _uop; + BinaryOp _bop; + public: + CompositeSecond(UnaryOp uop, BinaryOp bop) : _uop(uop), _bop(bop) { + + } + __device__ inline float operator()(const float a, const float b) const { + return _bop(a, _uop(b)); + } + }; +}; + +class NVMatrixAggs { +public: + class Sum { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + class Max { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b ? a : b; + } + __device__ inline float getBaseValue() { + return -2e38; + } + }; + + class Min { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b ? b : a; + } + __device__ inline float getBaseValue() { + return 2e38; + } + }; + + class CountNan { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + isnan(b); + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + class CountInf { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + isinf(b); + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + template + class ArgMax { + private: + UnaryOperator u; + public: + ArgMax(UnaryOperator _u) : u(_u) { + } + __device__ inline float operator()(const float a, const float b) const { + return u(a) > u(b) ? a : b; + } + __device__ inline float getBaseValue() { + return u.getArgMin(); + } + }; +}; + +class NVMatrixTernaryOps { +public: + class Add { + public: + __device__ inline float operator()(const float a, const float b, const float c) const { + return a + b + c; + } + }; + class WeightedAdd { + private: + const float scaleA, scaleB, scaleC; + public: + WeightedAdd(const float _scaleA, const float _scaleB, const float _scaleC) : scaleA(_scaleA), scaleB(_scaleB), scaleC(_scaleC) { + } + __device__ inline float operator()(const float a, const float b, const float c) const { + return a * scaleA + b * scaleB + c * scaleC; + } + }; +}; + +#endif /* NVMATRIX_OPERATORS_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu new file mode 100644 index 0000000..aab7e45 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu @@ -0,0 +1,97 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
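Each aggregate functor pairs a binary combine step with getBaseValue() as its identity element, which is all a fold needs: the reduction kernels start every accumulator at getBaseValue() and then repeatedly apply agg(acc, x). A small host-side illustration of that contract (names are illustrative):

```
#include <cstdio>

// Same shape as NVMatrixAggs::Max: a combine step plus an identity value.
struct MaxAgg {
    float operator()(float a, float b) const { return a > b ? a : b; }
    float getBaseValue() const { return -2e38f; }
};

// Generic fold: works for any functor exposing operator() and getBaseValue().
template <class Agg>
float reduce(const float* data, int n, Agg agg) {
    float acc = agg.getBaseValue();
    for (int i = 0; i < n; ++i) acc = agg(acc, data[i]);
    return acc;
}

int main() {
    float v[5] = {3.0f, -1.0f, 7.0f, 2.0f, 5.0f};
    printf("max = %.1f\n", reduce(v, 5, MaxAgg()));   // max = 7.0
    return 0;
}
```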
+ */ + +#include "../include/memory.cuh" + +Lock MemoryManager::_globalLock; +std::map FastMemoryManager::_memoryManagers; + +MemoryManager& FastMemoryManager::getInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManagers.count(deviceID) == 0) { + _memoryManagers[deviceID] = (new FastMemoryManager(deviceID))->init(); + } + MemoryManager& ret = *_memoryManagers[deviceID]; + _globalLock.release(); + return ret; +} + +MemoryManager* CUDAMemoryManager::_memoryManager = NULL; +MemoryManager& CUDAMemoryManager::getInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = new CUDAMemoryManager(); + } + _globalLock.release(); + return *_memoryManager; +} + +MemoryManager* CUDAHostMemoryManager::_memoryManager = NULL; +MemoryManager& CUDAHostMemoryManager::getInstance() { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = new CUDAHostMemoryManager(); + } + _globalLock.release(); + return *_memoryManager; +} + +MemoryManager* FastHostMemoryManager::_memoryManager = NULL; +MemoryManager& FastHostMemoryManager::getInstance() { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = (new FastHostMemoryManager())->init(); + } + _globalLock.release(); + return *_memoryManager; +} + + +void FastMemoryManager::destroyInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManagers.count(deviceID) != 0) { + delete _memoryManagers[deviceID]; + _memoryManagers.erase(deviceID); + } + _globalLock.release(); +} + +void FastHostMemoryManager::destroyInstance() { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} + +void CUDAMemoryManager::destroyInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} + +void CUDAHostMemoryManager::destroyInstance() { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu new file mode 100644 index 0000000..e37c1de --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu @@ -0,0 +1,1724 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../include/nvmatrix.cuh" +#include "../include/nvmatrix_operators.cuh" + +using namespace std; + +/* + * Device random number generator pointers. 
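memory.cu lazily builds one manager per device, and every getInstance/destroyInstance call holds a single global lock around the shared map so that two threads cannot both construct an instance. A minimal sketch of that create-if-absent-under-lock pattern, using std::mutex in place of the Lock class used here (type names are illustrative):

```
#include <map>
#include <mutex>

struct Manager {
    explicit Manager(int device) : device(device) {}
    int device;
};

class ManagerRegistry {
public:
    // Create-if-absent under a global lock, mirroring FastMemoryManager::getInstance.
    static Manager& getInstance(int deviceID) {
        std::lock_guard<std::mutex> guard(_lock);
        auto it = _managers.find(deviceID);
        if (it == _managers.end())
            it = _managers.emplace(deviceID, new Manager(deviceID)).first;
        return *it->second;
    }
    static void destroyInstance(int deviceID) {
        std::lock_guard<std::mutex> guard(_lock);
        auto it = _managers.find(deviceID);
        if (it != _managers.end()) {
            delete it->second;
            _managers.erase(it);
        }
    }
private:
    static std::mutex _lock;
    static std::map<int, Manager*> _managers;
};

std::mutex ManagerRegistry::_lock;
std::map<int, Manager*> ManagerRegistry::_managers;

int main() {
    Manager& m = ManagerRegistry::getInstance(0);
    (void)m;
    ManagerRegistry::destroyInstance(0);
    return 0;
}
```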
+ */ +//map NVMatrix::rndGen; +map NVMatrix::_rndDevStates; +map NVMatrix::_rndDevThreads; +pthread_mutex_t* NVMatrix::_rndMutex = makeMutex(); +pthread_mutex_t* NVMatrix::_cublasMutex = makeMutex(); +pthread_mutex_t* NVMatrix::_streamMutex = makeMutex(); +std::map NVMatrix::_cublasHandles; +std::map NVMatrix::_defaultStreams; + +pthread_mutex_t* NVMatrix::makeMutex() { + pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t)); + pthread_mutex_init(m, NULL); + return m; +} +/* + Do not call resize in _init because resize is a virtual function + which is overridden in base class. Since C++ is retarded and unable + to call overridden functions from constructors, we shall call resize + separately from every constructor after calling _init. +*/ +void NVMatrix::_init(bool isTrans) { + _numRows = 0; + _numCols = 0; + _numElements = 0; + _ownsData = true; + + _isTrans = isTrans; + _memSegment = NULL; + + _stride = 0; + _texObj = 0; +} + +NVMatrix::NVMatrix() : _deleted(false) { + _init(false); +} + +NVMatrix::NVMatrix(bool isTrans) : _deleted(false) { + _init(isTrans); +} + +NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) : _deleted(false) { + _init(isTrans); + resize(numRows, numCols); +} + +NVMatrix::NVMatrix(const Matrix& like, bool copy) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); + if (copy) { + copyFromHost(like); + } +} + +NVMatrix::NVMatrix(const NVMatrix& like, bool copy) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); + if (copy) { + like.copy(*this); + } +} + +/* + * Initializes NVMatrix with same dimensions as given matrix but + * does not copy any data. + */ +NVMatrix::NVMatrix(const NVMatrix& like) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); +} + +/* + * Initializes NVMatrix with same dimensions as given matrix but + * does not copy any data. + */ +NVMatrix::NVMatrix(const Matrix& like) : _deleted(false) { + _init(false); + resize(like.getNumRows(), like.getNumCols()); +} + +NVMatrix::NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) : + _numRows(numRows), + _numCols(numCols), + _numElements(numRows*numCols), + _ownsData(false), + _memSegment(mem), + _isTrans(isTrans), + _deleted(false), + _texObj(0) { + _stride = stride < 0 ? getLeadingDim() : stride; +} + +NVMatrix::~NVMatrix() { + if (!_deleted) { + deallocTexture(); + if(_ownsData && _numElements > 0) { + dealloc(); + } else { + // dealloc deletes the mem segment. But if this is a view, + // then we still need to delete the mem segment object. 
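The comment above _init describes a real C++ rule: while a base-class constructor runs, the object's dynamic type is still the base class, so a virtual call resolves to the base implementation. That is why every constructor calls resize explicitly after _init instead of relying on _init to do it. A small stand-alone illustration (hypothetical class names):

```
#include <cstdio>

struct Base {
    Base() { whoAmI(); }                        // virtual call during construction...
    virtual void whoAmI() { printf("Base\n"); } // ...binds to Base::whoAmI
    virtual ~Base() {}
};

struct Derived : Base {
    void whoAmI() override { printf("Derived\n"); }
};

int main() {
    Derived d;      // prints "Base": the override is not active yet
    d.whoAmI();     // prints "Derived": normal virtual dispatch after construction
    return 0;
}
```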
+// assert(_memSegment == NULL || _memSegment->getSize() == 0); + delete _memSegment; + } + } +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix) { + copyFromHost(hostMatrix, false, getDefaultStream()); +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { + copyFromHost(hostMatrix, resizeTarget, getDefaultStream()); +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) { + if (resizeTarget) { + resize(hostMatrix); + } else { + assert(isSameDims(hostMatrix)); + } + setTrans(hostMatrix.isTrans()); + + if (getNumElements() > 0) { + CUBLAS_CALL(cublasSetMatrixAsync(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float), + hostMatrix.getData(), hostMatrix.getLeadingDim(), getDevData(), _stride, stream)); + syncStream(stream); + } +} + +void NVMatrix::copyToHost(Matrix& hostMatrix) const { + copyToHost(hostMatrix, false, getDefaultStream()); +} + +void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { + copyToHost(hostMatrix, resizeTarget, getDefaultStream()); +} + +void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { + if (resizeTarget) { + hostMatrix.resize(_numRows, _numCols); + } else { + assert(isSameDims(hostMatrix)); + } + hostMatrix.setTrans(_isTrans); + + if (getNumElements() > 0) { + CUBLAS_CALL(cublasGetMatrixAsync(getLeadingDim(),getFollowingDim(), sizeof(float), + getDevData(), getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim(), stream)); + syncStream(stream); + } +} + +void NVMatrix::copy(NVMatrix& dest) const { + copy(dest, getDefaultStream()); +} + +void NVMatrix::copy(NVMatrix& dest, cudaStream_t stream) const { + if (&dest != this) { + if (!isSameDims(dest)) { + dest.resize(*this); + } + copy(dest, 0, -1, 0, -1, 0, 0, stream); + } +} + +NVMatrix& NVMatrix::copy() const { + NVMatrix& c = construct(); + copy(c); + return c; +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target) { + rightMult(b, scaleAB, target, getDefaultStream()); +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream) { +// if(&target != this && &target != &b) { +// target.resize(_numRows, b.getNumCols()); +// target.setTrans(true); +// } + target.addProduct(*this, b, 0, scaleAB, stream); +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB) { + rightMult(b, scaleAB, *this); +} + +void NVMatrix::rightMult(NVMatrix &b, NVMatrix& target) { + rightMult(b, 1, target); +} + +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB) { + addProduct(a, b, scaleThis, scaleAB, getDefaultStream()); +} + +/* + * This will only work if this matrix is in column-major order! In other words, + * if isTrans() returns true. 
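addProduct hands the multiply to cublasSgemm, and cuBLAS assumes column-major storage, which is why the comment requires isTrans() to be true for the target. A stand-alone sketch of an equivalent call on small column-major matrices (a minimal example, not this class's code):

```
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    // Column-major A (2x3) and B (3x2): C = A * B is 2x2.
    const int m = 2, k = 3, n = 2;
    float hA[m * k] = {1, 4,  2, 5,  3, 6};          // columns of A
    float hB[k * n] = {7, 9, 11,  8, 10, 12};        // columns of B
    float hC[m * n] = {0};

    float *dA, *dB, *dC;
    cudaMalloc(&dA, sizeof(hA));
    cudaMalloc(&dB, sizeof(hB));
    cudaMalloc(&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;           // C = 1*A*B + 0*C
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, dA, m, dB, k, &beta, dC, m); // leading dims = row counts

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("%.0f %.0f\n%.0f %.0f\n", hC[0], hC[2], hC[1], hC[3]);  // 58 64 / 139 154
    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}
```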
+ */ +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream) { + assert(a.getNumCols() == b.getNumRows()); + + if (scaleThis == 0) { + resize(a.getNumRows(), b.getNumCols()); + setTrans(true); + } + + assert(this->getNumRows() == a.getNumRows()); + assert(this->getNumCols() == b.getNumCols()); + assert(_isTrans); + CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); + CUBLAS_CALL(cublasSgemm_v2(getCublasHandle(), a.getTransChar(), b.getTransChar(), a.getNumRows(), b.getNumCols(), a.getNumCols(), + &scaleAB, a.getDevData(), a.getStride(), b.getDevData(), b.getStride(), + &scaleThis, getDevData(), getStride())); +} + +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b) { + addProduct(a, b, 1, 1); +} + +void NVMatrix::assertSame(NVMatrixV& a) { + for (int i = 1; i < a.size(); ++i) { + assert(a[i]->isSameDims(*a[0])); + assert(a[i]->isTrans() == a[0]->isTrans()); + assert(a[i]->getStride() == a[0]->getStride()); + assert(a[i]->getDataDeviceID() == a[0]->getDataDeviceID()); + } +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, + const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream(), aPtrsDev, bPtrsDev, tgtPtrsDev); +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB) { + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream()); +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, + const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { + assert(a.size() == b.size()); + assert(a.size() == target.size()); + assertSame(a); + assertSame(b); + assertSame(target); + + const int batch = a.size(); + if (batch > 0) { + const int rows = a[0]->getNumRows(), inner = a[0]->getNumCols(), cols = b[0]->getNumCols(); + + assert(inner == b[0]->getNumRows()); + assert(target[0]->getNumRows() == rows); + assert(target[0]->getNumCols() == cols); + + const int lda = a[0]->getStride(), ldb = b[0]->getStride(), ldc = target[0]->getStride(); + cublasOperation_t atrans = a[0]->getTransChar(), btrans = b[0]->getTransChar(); + + CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); + CUBLAS_CALL(cublasSgemmBatched(getCublasHandle(), atrans, btrans, rows, cols, inner, &scaleAB, aPtrsDev, lda, bPtrsDev, ldb, &scaleTarget, tgtPtrsDev, ldc, batch)); + } +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream) { + assert(a.size() == b.size()); + assert(a.size() == target.size() || target.size() == 0); + + const int batch = a.size(); + if (batch > 0) { + const int rows = a[0]->getNumRows(), cols = b[0]->getNumCols(); + + const float* aPtrs[batch], *bPtrs[batch], *tgtPtrs[batch]; + for (int i = 0; i < batch; ++i) { + if (target.size() <= i) { + target.push_back(new NVMatrix(rows, cols, true)); + } + aPtrs[i] = a[i]->getDevData(); + bPtrs[i] = b[i]->getDevData(); + tgtPtrs[i] = target[i]->getDevData(); + } + +// const float** aPtrsDev, **bPtrsDev; +// float **tgtPtrsDev; +// checkCudaErrors(cudaMalloc(&aPtrsDev, batch * sizeof(float*))); +// checkCudaErrors(cudaMalloc(&bPtrsDev, batch * sizeof(float*))); +// checkCudaErrors(cudaMalloc(&tgtPtrsDev, batch * sizeof(float*))); + MemorySegment* aPtrsDev = 
DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + MemorySegment* bPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + MemorySegment* tgtPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + + checkCudaErrors(cudaMemcpyAsync(aPtrsDev, aPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(bPtrsDev, bPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(tgtPtrsDev, tgtPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, stream, const_cast(aPtrsDev->getData()), + const_cast(bPtrsDev->getData()), + tgtPtrsDev->getData()); + +// checkCudaErrors(cudaFree(aPtrsDev)); +// checkCudaErrors(cudaFree(bPtrsDev)); +// checkCudaErrors(cudaFree(tgtPtrsDev)); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(aPtrsDev); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(bPtrsDev); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(tgtPtrsDev); + } +} + +template +void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) { + _unaryRandomize(target, rnd, getDefaultStream()); +} + +template +void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream) { + assert(isRndInitialized()); + assert(isContiguous() && target.isContiguous()); + if (!isSameDims(target)) { + target.resize(*this); + } + assert(isTrans() == target.isTrans()); + kUnaryRandomize<<>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd); + getLastCudaError("kUnaryRandomize: Kernel execution failed"); +} + +template +void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) { + _binaryRandomize(data2, target, rnd, getDefaultStream()); +} + +template +void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream) { + assert(isRndInitialized()); + assert(isContiguous() && data2.isContiguous() && target.isContiguous()); + assert(isSameDims(data2)); + assert(isTrans() == data2.isTrans()); + if (!isSameDims(target)) { + target.resize(*this); + } + assert(isTrans() == target.isTrans()); + kBinaryRandomize<<>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd); + getLastCudaError("kBinaryRandomize: Kernel execution failed"); +} + +void NVMatrix::initRandom(unsigned long long seed, int numStreams) { + NVMatrix::initRandom(seed, numStreams, NVMatrix::getDefaultStream()); +} + +void NVMatrix::initRandom(unsigned long long seed, int numStreams, cudaStream_t stream) { +// printf("init random on device %d\n", getDeviceID()); + pthread_mutex_lock(_rndMutex); + assert(!isRndInitialized(true)); + int d = getDeviceID(); +// _rndDevStates[d] = NULL; + _rndDevThreads[d] = numStreams; + _rndDevStates[d] = DEVICE_MEMORY_MANAGER::getInstance(d).malloc(numStreams * sizeof(curandState)); +// checkCudaErrors(cudaMalloc((void **)&_rndDevStates[d], numStreams * sizeof(curandState))); + pthread_mutex_unlock(_rndMutex); + kSetupCurand<<>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one + getLastCudaError("kSetupCurand: Kernel execution failed"); +} + +void NVMatrix::initRandom(unsigned long long seed) { + initRandom(seed, NUM_RND_STREAMS); +} + +void NVMatrix::initRandom() { + NVMatrix::initRandom(time(0)); +} + +void NVMatrix::initCublas() { + int d = getDeviceID(); + 
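initRandom allocates a pool of curandState objects once, and the randomize kernels copy a thread's state into registers, draw numbers, and write the state back so that sequences continue across launches. A minimal sketch of that lifecycle with the cuRAND device API (kernel and buffer names are illustrative):

```
#include <cstdio>
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void setupStates(curandState* states, unsigned long long seed) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tid, 0, &states[tid]);   // one independent sequence per thread
}

__global__ void fillUniform(float* out, int n, curandState* states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState local = states[tid];           // work on a register-local copy
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_uniform(&local);
    states[tid] = local;                       // persist so the next launch continues the sequence
}

int main() {
    const int threads = 128, blocks = 2, n = 1000;
    curandState* states;
    float* data;
    cudaMalloc(&states, blocks * threads * sizeof(curandState));
    cudaMalloc(&data, n * sizeof(float));
    setupStates<<<blocks, threads>>>(states, 1234ULL);
    fillUniform<<<blocks, threads>>>(data, n, states);
    float first;
    cudaMemcpy(&first, data, sizeof(float), cudaMemcpyDeviceToHost);
    printf("first sample: %f\n", first);
    cudaFree(states);
    cudaFree(data);
    return 0;
}
```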
pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(d) == 0); + CUBLAS_CALL(cublasCreate(&_cublasHandles[d])); + // It appears that cublasCreate causes a host -> device copy on stream 0, + // so we synchronize with it because we run everything else on other + // streams. + syncDevice(); + pthread_mutex_unlock(_cublasMutex); +} + +void NVMatrix::destroyCublas() { + int d = getDeviceID(); + pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(d) > 0); + CUBLAS_CALL(cublasDestroy(_cublasHandles[d])); + _cublasHandles.erase(d); + pthread_mutex_unlock(_cublasMutex); +} + +cublasHandle_t NVMatrix::getCublasHandle() { + return getCublasHandle(getDeviceID()); +} + +cublasHandle_t NVMatrix::getCublasHandle(int deviceID) { + pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(deviceID) > 0); + cublasHandle_t h = _cublasHandles[deviceID]; + pthread_mutex_unlock(_cublasMutex); + return h; +} + +cudaStream_t NVMatrix::getDefaultStream() { + return getDefaultStream(NVMatrix::getDeviceID()); +} + +cudaStream_t NVMatrix::getDefaultStream(int deviceID) { + if (deviceID >= 0) { + pthread_mutex_lock(_streamMutex); + if (_defaultStreams.count(deviceID) == 0) { + int oldDeviceID = getDeviceID(); + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_defaultStreams[deviceID], cudaStreamNonBlocking)); + NVMatrix::setDeviceID(oldDeviceID); + } + cudaStream_t s = _defaultStreams[deviceID]; + pthread_mutex_unlock(_streamMutex); + return s; + } + return 0; +} + +void NVMatrix::syncDevice() { + checkCudaErrors(cudaDeviceSynchronize()); +} + +void NVMatrix::syncStream(cudaStream_t stream) { + checkCudaErrors(cudaStreamSynchronize(stream)); +} + +void NVMatrix::syncStream() { + syncStream(getDefaultStream()); +} + +curandState* NVMatrix::getCurandState() { + /* + * Even though we're only reading from the map here, it's important to grab + * the mutex because another thread may be writing to it. + */ + pthread_mutex_lock(_rndMutex); + int d = getDeviceID(); + assert(isRndInitialized(true)); + curandState* r = _rndDevStates[d]->getData(); + pthread_mutex_unlock(_rndMutex); + return r; +} + +curandState* NVMatrix::getCurandState(int numStreams) { + int d = getDeviceID(); + pthread_mutex_lock(_rndMutex); + assert(isRndInitialized(true)); + bool realloc = numStreams > _rndDevThreads[d]; + pthread_mutex_unlock(_rndMutex); + + if (realloc) { + destroyRandom(); + initRandom(time(0), numStreams); + } + return getCurandState(); +} + +int NVMatrix::getDataDeviceID() const { + if (getDevData() == NULL) { + return DEVICE_NULL; + } + struct cudaPointerAttributes atts; + checkCudaErrors(cudaPointerGetAttributes(&atts, getDevData())); + return atts.memoryType == cudaMemoryTypeDevice ? 
atts.device : DEVICE_HOST; +} + + +int NVMatrix::getDeviceID() { + int d; + checkCudaErrors(cudaGetDevice(&d)); +// if (d == 0) { +// raise(SIGABRT); +// } + return d; +} + +void NVMatrix::setDeviceID(int d) { + assert(d >= 0); +// printf("Setting device to %d\n", d); +// if (d == 0) { +// raise(SIGABRT); +// } + checkCudaErrors(cudaSetDevice(d)); +} + +bool NVMatrix::canAccessPeer(int srcDevice, int tgtDevice) { + if (srcDevice == tgtDevice) { + return true; + } + int canAccess; + checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice)); + return canAccess; +} + +bool NVMatrix::isRndInitialized(bool haveLock) { + if (!haveLock) { + pthread_mutex_lock(_rndMutex); + } + bool b = _rndDevStates.count(getDeviceID()) != 0; + if (!haveLock) { + pthread_mutex_unlock(_rndMutex); + } + return b; +} + +bool NVMatrix::isRndInitialized() { + return isRndInitialized(false); +} + +void NVMatrix::destroyRandom() { + int d = getDeviceID(); + pthread_mutex_lock(_rndMutex); + assert(isRndInitialized(true)); +// checkCudaErrors(cudaFree(_rndDevStates[d])); + DEVICE_MEMORY_MANAGER::getInstance(d).free(_rndDevStates[d]); + _rndDevStates.erase(d); + _rndDevThreads.erase(d); + pthread_mutex_unlock(_rndMutex); +} + +void NVMatrix::binarizeProbs() { + binarizeProbs(*this); +} + +void NVMatrix::binarizeProbs(NVMatrix& target) { + _unaryRandomize(target, BinarizeUnaryRandomizer()); +} + +void NVMatrix::randomizeUniform() { + assert(isContiguous()); + assert(isRndInitialized()); +// CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements())); + _unaryRandomize(*this, UniformUnaryRandomizer()); +} + +void NVMatrix::randomizeGaussian() { + randomizeGaussian(1); +} + +void NVMatrix::randomizeGaussian(float stdev) { + randomizeGaussian(0, stdev); +} + +void NVMatrix::randomizeGaussian(float mean, float stdev) { + assert(isContiguous()); + assert(isRndInitialized()); +// CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev)); + _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev)); +} + +/* + * Kind of a hack since we don't actually need the contents of this matrix for it, + * so we don't really need a binary randomizer. 
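canAccessPeer only reports whether two GPUs can address each other's memory; actually issuing peer copies also requires enabling access from the source device, a step this class leaves to its callers. A short sketch of that extra step with the CUDA runtime API (the helper name is hypothetical):

```
#include <cstdio>
#include <cuda_runtime.h>

// Query peer capability and, if available, enable srcDevice -> tgtDevice access.
bool enablePeer(int srcDevice, int tgtDevice) {
    if (srcDevice == tgtDevice) return true;
    int canAccess = 0;
    cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice);
    if (!canAccess) return false;
    cudaSetDevice(srcDevice);
    cudaError_t err = cudaDeviceEnablePeerAccess(tgtDevice, 0);   // flags must be 0
    return err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled;
}

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i)
        for (int j = 0; j < count; ++j)
            printf("peer %d -> %d: %s\n", i, j, enablePeer(i, j) ? "yes" : "no");
    return 0;
}
```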
+ */ +void NVMatrix::randomizeGaussian(NVMatrix& stdevs) { + randomizeGaussian(0, stdevs); +} + +void NVMatrix::randomizeGaussian(float mean, NVMatrix& stdevs) { + _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer(mean)); +} + +void NVMatrix::randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs) { + _binaryRandomize(stdevs, *this, ScaledGaussianBinaryRandomizer(mean, stdevMult)); +} + +void NVMatrix::addGaussianNoise() { + addGaussianNoise(1); +} + +void NVMatrix::addGaussianNoise(float stdev) { + addGaussianNoise(stdev, *this); +} + +void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) { + _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev)); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) { + addGaussianNoise(stdevs, var, *this); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs) { + addGaussianNoise(stdevs, false, *this); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) { + if (var) { + _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer()); + } else { + _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer()); + } +} + +void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target); +} + +void NVMatrix::biggerThan(NVMatrix& b) { + biggerThan(b, *this); +} + +void NVMatrix::equals(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Equals(), b, target); +} + +void NVMatrix::equals(NVMatrix& m) { + equals(m, *this); +} + +void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target); +} + +void NVMatrix::biggerThanVector(NVMatrix& vec) { + biggerThanVector(vec, *this); +} + +void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const { + assert(startRow >= 0 && startRow <= _numRows); + assert(endRow >= startRow && endRow <= _numRows); + + assert(startCol >= 0 && startCol <= _numCols); + assert(endCol >= startCol && endCol <= _numCols); +} + +/* + * The only place where stride is supported for now! + * Will ALWAYS return a view of the original data, sometimes non-contiguous. + */ +NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + + if (!isTrans()) { + return construct(new MemorySegment(this->getDevData() + startRow * _stride + startCol), endRow - startRow, endCol - startCol, _stride, false); + } + return construct(new MemorySegment(this->getDevData() + startCol * _stride + startRow), endRow - startRow, endCol - startCol, _stride, true); +} + +/* this will NEVER return a view */ +void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? 
this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + + int sliceRows = endRow - startRow, sliceCols = endCol - startCol; + if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) { + target.resize(sliceRows, sliceCols); + } + this->copy(target, startRow, endRow, startCol, endCol, 0, 0); +} + +NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const { + return slice(startRow, endRow, 0, -1); +} + +void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const { + slice(startRow, endRow, 0, -1, target); +} + +NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const { + return slice(0, -1, startCol, endCol); +} + +void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const { + slice(0, -1, startCol, endCol, target); +} + +NVMatrixV& NVMatrix::splitRows(int numParts) { + assert(getNumRows() % numParts == 0); + NVMatrixV& v = *new NVMatrixV(); + int partSize = getNumRows() / numParts; + for (int p = 0; p < numParts; ++p) { + v.push_back(&sliceRows(p * partSize, (p+1) * partSize)); + } + return v; +} + +NVMatrixV& NVMatrix::splitCols(int numParts) { + assert(getNumCols() % numParts == 0); + NVMatrixV& v = *new NVMatrixV(); + int partSize = getNumCols() / numParts; + for (int p = 0; p < numParts; ++p) { + v.push_back(&sliceCols(p * partSize, (p+1) * partSize)); + } + return v; +} + +/* + * Guaranteed to not change the data if the number of elements doesn't change. + * So you can use this to "reshape" a matrix. + */ +bool NVMatrix::resize(int numRows, int numCols, bool trans) { + setTrans(trans); + bool reallocated = false; + if (numRows != _numRows || numCols != _numCols) { + assert(_ownsData || (_numElements == numRows * numCols && isContiguous())); + if (_numElements != numRows * numCols) { + if (_numElements > 0) { // free old memory + dealloc(); + } + if (numRows * numCols > 0) { // allocate new memory + alloc(numCols * numRows); + } else { + _memSegment = NULL; + } + reallocated = true; + } + _numRows = numRows; + _numCols = numCols; + _numElements = numRows * numCols; + _stride = getLeadingDim(); + } + return reallocated; +} + +bool NVMatrix::resize(int numRows, int numCols) { + return resize(numRows, numCols, isTrans()); +} + +bool NVMatrix::resize(const NVMatrix& like) { + setTrans(like.isTrans()); + return resize(like.getNumRows(), like.getNumCols()); +} + +bool NVMatrix::resize(const Matrix& like) { + setTrans(like.isTrans()); + return resize(like.getNumRows(), like.getNumCols()); +} + +void NVMatrix::reshape(int numRows, int numCols) { + assert(isContiguous()); + assert(_numElements == numRows*numCols); + _numRows = numRows; + _numCols = numCols; + _stride = getLeadingDim(); +} + +NVMatrix& NVMatrix::reshaped(int numRows, int numCols) const { + assert(isContiguous()); + assert(_numElements == numRows*numCols); + return construct(new MemorySegment(*_memSegment), numRows, numCols, -1, _isTrans); +} + +void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, + int srcStartCol, int srcEndCol, + int destStartRow, int destStartCol) const { + copy(dest, srcStartRow, srcEndRow, srcStartCol, srcEndCol, destStartRow, destStartCol, getDefaultStream()); +} + +void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, + int srcStartCol, int srcEndCol, + int destStartRow, int destStartCol, cudaStream_t stream) const { + srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow; + srcEndCol = srcEndCol < 0 ? 
_numCols : srcEndCol; + NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol); + NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol); + if (srcSlice->isContiguous() && destSlice->isContiguous() && srcSlice->isSameDims(*destSlice) && srcSlice->isTrans() == destSlice->isTrans()) { + // The commonest case. + checkCudaErrors(cudaMemcpyAsync(destSlice->getDevData(), srcSlice->getDevData(), srcSlice->getNumDataBytes(), cudaMemcpyDefault, stream)); + } else { + srcSlice->apply(NVMatrixOps::Identity(), *destSlice, stream); + } + delete srcSlice; + delete destSlice; +} + + +NVMatrix& NVMatrix::getTranspose() { + return construct(new MemorySegment(*_memSegment), _numCols, _numRows, _stride, !_isTrans); +} + +NVMatrix& NVMatrix::getClone() { + return construct(new MemorySegment(*_memSegment), _numRows, _numCols, _stride, _isTrans); +} + +void NVMatrix::transpose(NVMatrix& target) { + flipTrans(target); + target.setTrans(!target.isTrans()); + target.reshape(target.getNumCols(), target.getNumRows()); +} + +void NVMatrix::transpose() { + int tmp = _numCols; + _numCols = _numRows; + _numRows = tmp; + _isTrans = !_isTrans; +} + +bool NVMatrix::transpose(bool trans) { + bool oldTrans = _isTrans; + if (oldTrans != trans) { + transpose(); + } + return oldTrans; +} + +/* + * Flips the ordering of the matrix from row-major to column-major and vice versa. + * This creates temporary storage -- not a cheap operation. + * + * This is not equivalent to a "hard transpose". The resultant matrix still has + * the same dimensions, its layout in memory just changes. + */ +NVMatrix& NVMatrix::flipTrans() { + NVMatrix& meTrans = construct(*this); + flipTrans(meTrans); + return meTrans; +} + +void NVMatrix::flipTrans(NVMatrix& target) { + flipTrans(target, getDefaultStream()); +} + +void NVMatrix::flipTrans(NVMatrix& target, cudaStream_t stream) { + assert(&target != this); + target.resize(_numRows, _numCols); + target.setTrans(!isTrans()); +// target.printShape("target"); +// this->printShape("this"); + apply(NVMatrixOps::Identity(), target, stream); +} + +void NVMatrix::squaredDiff(NVMatrix& b) { + squaredDiff(b, *this); +} + +void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) { + add(b, scaleA, scaleB, target, NVMatrix::getDefaultStream()); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream) { + if (scaleA == 0) { + b.scale(scaleB, target, stream); + } else if (scaleB == 0) { + scale(scaleA, target, stream); + } else if (scaleA == 1 && scaleB == 1) { // slight optimization + applyBinary(NVMatrixBinaryOps::Add(), b, target, stream); + } else if (scaleA == 1) { + applyBinary(NVMatrixBinaryOps::WeightedAdd1(scaleB), b, target, stream); + } else { + applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target, stream); + } +} + +void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) { + add(b, 1, scaleB, target); +} + +void NVMatrix::add(NVMatrix& b, NVMatrix& target) { + add(b, 1, target); +} + +void NVMatrix::add(NVMatrix& b, float scaleB) { + add(b, scaleB, *this); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) { + add(b, scaleA, scaleB, *this); +} + +void NVMatrix::add(NVMatrix& b) { + add(b, 1, *this); +} + +void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) { + 
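transpose() and getTranspose() never move data: they swap the row/column counts and the _isTrans flag, so lookups simply switch between the row-major and column-major index formulas, whereas flipTrans() allocates a target and rewrites the layout. A small host-side sketch of that flag-based indexing (illustrative names):

```
#include <cassert>
#include <cstdio>

// Read element (row, col) from a buffer that is either row-major
// (isTrans == false) or column-major (isTrans == true), as NVMatrix does.
float at(const float* data, int stride, bool isTrans, int row, int col) {
    return isTrans ? data[col * stride + row] : data[row * stride + col];
}

int main() {
    // A 2x3 matrix stored row-major with stride 3: [[1 2 3], [4 5 6]].
    float data[6] = {1, 2, 3, 4, 5, 6};
    // The same buffer viewed as its 3x2 transpose: flip the flag, swap the
    // dims, keep the stride; no copy needed, which is what getTranspose() returns.
    assert(at(data, 3, false, 1, 2) == 6.0f);   // original (1,2)
    assert(at(data, 3, true, 2, 1) == 6.0f);    // transposed view (2,1)
    printf("ok\n");
    return 0;
}
```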
add(b, -1, target); +} + +void NVMatrix::subtract(NVMatrix& b) { + add(b, -1); +} + +void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Multiply(), b, target); +} + +void NVMatrix::eltwiseMult(NVMatrix& b) { + eltwiseMult(b, *this); +} + +void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Divide(), b, target); +} + +void NVMatrix::eltwiseDivide(NVMatrix& b) { + eltwiseDivide(b, *this); +} + +void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) { + tile(timesY, timesX, target, getDefaultStream()); +} + +void NVMatrix::tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream) { + assert(isContiguous() && target.isContiguous()); + assert(timesX > 0 && timesY > 0); + target.resize(_numRows*timesY, _numCols*timesX); + target.setTrans(_isTrans); + if(!isTrans()) { + kTile<<>>(getDevData(), target.getDevData(), _numCols, _numRows, target._numCols, target._numRows); + } else { + kTile<<>>(getDevData(), target.getDevData(), _numRows, _numCols, target._numRows, target._numCols); + } + getLastCudaError("Kernel execution failed"); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) { + addVector(vec, scaleVec, target, getDefaultStream()); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream) { + applyBinaryV(NVMatrixBinaryOps::ScaledAdd(scaleVec), vec, target, stream); +} + +void NVMatrix::addVector(NVMatrix& vec) { + addVector(vec, 1); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec) { + addVector(vec, scaleVec, *this); +} + +void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) { + addVector(vec, 1, target); +} + +void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target); +} + +void NVMatrix::equalsVector(NVMatrix& vec) { + equalsVector(vec, *this); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) { + eltwiseMultByVector(vec, target, getDefaultStream()); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream) { + applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target, stream); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream) { + eltwiseMultByVector(vec, *this, stream); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec) { + eltwiseMultByVector(vec, *this); +} + +void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) { + eltwiseDivideByVector(vec, *this); +} + +void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { + _aggregate(axis, target, agg, uop, bop, stream, NULL); +} + +/* + * TODO: this is a mess, fix it. it works pretty fast but it's too ugly. + * TODO: this function is _really_ bad for very long aggregations of few columns. + */ +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp) { + assert(axis == 0 || axis == 1); + assert(isContiguous() && target.isContiguous()); + assert(&target != this); + int width = _isTrans ? _numRows : _numCols; + int height = _isTrans ? _numCols : _numRows; + + target.setTrans(_isTrans); + assert(width > 0); + assert(height > 0); + if((axis == 0 && !_isTrans) || (axis == 1 && _isTrans)) { //col sum + target.resize(!_isTrans ? 
1 : _numRows, !_isTrans ? _numCols : 1); +// int height = getFollowingDim(); + if ((height <= 2048 || width >= 4096)) { + int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width); + assert(numBlocks < NUM_BLOCKS_MAX); + kDumbAggCols<<>>(getTextureObject(), target.getDevData(), width, height, agg, uop, bop); + getLastCudaError("kDumbAggCols: Kernel execution failed"); + } else { // Specialize the case when we have very long columns and few of them + const int sumLength = 128; + bool deltmp = tmp == NULL; + if (tmp == NULL) { + tmp = new NVMatrix(false); + } + + int numBlocksX = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + int numBlocksY = DIVUP(height, sumLength); + tmp->resize(numBlocksY, width); + + dim3 blocks(numBlocksX, numBlocksY); + dim3 threads(NUM_SUM_COLS_THREADS_PER_BLOCK); + kAggCols<<>>(getTextureObject(), tmp->getDevData(), width, height, sumLength, agg, uop); + getLastCudaError("kAggCols: Kernel execution failed"); + + int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + kDumbAggCols<<>>(tmp->getTextureObject(), target.getDevData(), width, numBlocksY, agg, NVMatrixOps::Identity(), bop); + getLastCudaError("kDumbAggCols: Kernel execution failed"); + if (deltmp) { + delete tmp; + } + } + } else { // row sum + target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1); + if (width > 1) { + if (height >= 16384) { // linear aggregation + int numBlocksX = 1; + int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y); + int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X; + int numThreadsY = AGG_SHORT_ROWS_THREADS_Y; + while (numBlocksY > NUM_BLOCKS_MAX) { + numBlocksY = DIVUP(numBlocksY,2); + numBlocksX *= 2; + } + dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); + if(width <= 16) { + if(width <= 4) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 8) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 12) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } + } else if(width <= 32) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 48){ + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 64){ + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else { + kAggShortRows2<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } + } else { + if (width >= 512) { + // NOTE: this is the only case which I bothered to try to optimize for Kepler + dim3 threads(AWR_NUM_THREADS); + dim3 blocks(1, height); + kAggRows_wholerow_nosync<<>>(getDevData(), target.getDevData(), width, height, agg, uop, bop); + } else { + + int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 
256 : 512))); + int numThreadsY = 1; + int numBlocksX = DIVUP(width, 2*numThreadsX); + int numBlocksY = std::min(height, NUM_BLOCKS_MAX); + + dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); + assert(numBlocksX <= NUM_BLOCKS_MAX); + assert(numBlocksY <= NUM_BLOCKS_MAX); + + if(width <= 64) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 128) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 256) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 512) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } + + getLastCudaError("agg rows: Kernel execution failed"); + } + } + } else { + target.applyBinary(NVMatrixBinaryOps::CompositeSecond(uop, bop), *this, target, stream); +// copy(target, stream); + } + } +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop) { + _aggregate(axis, target, agg, uop, bop, getDefaultStream()); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream()); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, stream); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop) { + return _aggregate(axis, agg, NVMatrixOps::Identity(), bop); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream) { + return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream); +} + + + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { + _aggregate(axis, target, agg, uop, bop, getDefaultStream(), tmp); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream(), &tmp); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream, &tmp); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, tmp); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, stream, tmp); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp) 
{
+    return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, tmp);
+}
+
+template <class Agg, class BinaryOp>
+NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) {
+    return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream, tmp);
+}
+
+void NVMatrix::inRangeInc(float lower, float upper) {
+    inRangeInc(lower, upper, *this);
+}
+void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) {
+    apply(NVMatrixOps::InRange<false>(lower, upper), target);
+}
+
+void NVMatrix::inRangeExc(float lower, float upper) {
+    inRangeExc(lower, upper, *this);
+}
+
+void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) {
+    apply(NVMatrixOps::InRange<true>(lower, upper), target);
+}
+
+void NVMatrix::biggerThanScalar(float scalar) {
+    biggerThanScalar(scalar, *this);
+}
+
+void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::BiggerThanScalar(scalar), target);
+}
+
+void NVMatrix::smallerThanScalar(float scalar) {
+    smallerThanScalar(scalar, *this);
+}
+
+void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::SmallerThanScalar(scalar), target);
+}
+
+void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target);
+}
+
+void NVMatrix::addScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::AddScalar(scalar), target);
+}
+
+void NVMatrix::addScalar(float scalar) {
+    addScalar(scalar, *this);
+}
+
+void NVMatrix::minWithScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::MinWithScalar(scalar), target);
+}
+
+void NVMatrix::minWithScalar(float scalar) {
+    minWithScalar(scalar, *this);
+}
+
+void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::MaxWithScalar(scalar), target);
+}
+
+void NVMatrix::maxWithScalar(float scalar) {
+    maxWithScalar(scalar, *this);
+}
+
+void NVMatrix::pow(float p, NVMatrix& target) {
+    apply(NVMatrixOps::Pow(p), target);
+}
+
+void NVMatrix::pow(float p) {
+    pow(p, *this);
+}
+
+void NVMatrix::scale(float _scale) {
+    scale(_scale, *this);
+}
+
+void NVMatrix::scale(float _scale, cudaStream_t stream) {
+    scale(_scale, *this, stream);
+}
+
+void NVMatrix::scale(float _scale, NVMatrix& target) {
+    scale(_scale, target, NVMatrix::getDefaultStream());
+}
+
+void NVMatrix::scale(float _scale, NVMatrix& target, cudaStream_t stream) {
+    if (_scale != 1 || &target != this) { // optimize away scale by 1
+        if (_scale == 1) {
+            copy(target, stream);
+        } else {
+            apply(NVMatrixOps::MultByScalar(_scale), target, stream);
+        }
+    }
+}
+
+void NVMatrix::zero() {
+    apply(NVMatrixOps::Zero());
+}
+
+void NVMatrix::zero(NVMatrix& like) {
+    resize(like);
+    zero();
+}
+
+void NVMatrix::max(int axis, NVMatrix& target) {
+    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
+}
+
+void NVMatrix::max(int axis, NVMatrix& target, NVMatrix& tmp) {
+    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second(), tmp);
+}
+
+void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) {
+    addSum(a, axis, scaleThis, scaleSum, getDefaultStream());
+}
+
+void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream) {
+    if (scaleThis != 0) {
+        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum), stream);
+    } else {
+        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum), stream);
+    }
+}
+
+void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax) {
+    addMax(a, axis, scaleThis, scaleMax, getDefaultStream());
+}
+
+void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream) {
+    if (scaleThis != 0) {
+        a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleMax), stream);
+    } else {
+        a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::SecondScaled(scaleMax), stream);
+    }
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target) {
+    sum(axis, target, getDefaultStream());
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream);
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, NVMatrix& tmp) {
+    sum(axis, target, getDefaultStream(), tmp);
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream, tmp);
+}
+
+void NVMatrix::sumOfSquares(int axis, NVMatrix& target) {
+    sumOfSquares(axis, target, getDefaultStream());
+}
+
+void NVMatrix::sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second(), stream);
+}
+
+void NVMatrix::min(int axis, NVMatrix& target) {
+    _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::max(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::sum(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::min(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::sumOfSquares(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second());
+}
+
+void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads) {
+    *threads = dim3(DP_BLOCKSIZE);
+    *blocks = dim3(std::min(CPUSUM_MAX, DIVUP(n, DP_BLOCKSIZE)));
+}
+
+float NVMatrix::mean() {
+    return sum() / getNumElements();
+}
+
+float NVMatrix::sum() {
+    return _totalAgg(NVMatrixAggs::Sum());
+}
+
+float NVMatrix::sum(NVMatrix& tmpbuf) {
+    return _totalAgg(NVMatrixAggs::Sum(), tmpbuf, getDefaultStream());
+}
+
+float NVMatrix::max() {
+    return _totalAgg(NVMatrixAggs::Max());
+}
+
+float NVMatrix::min() {
+    return _totalAgg(NVMatrixAggs::Min());
+}
+
+float NVMatrix::countNan() {
+    return _totalAgg(NVMatrixAggs::CountNan());
+}
+
+float NVMatrix::countInf() {
+    return _totalAgg(NVMatrixAggs::CountInf());
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg) {
+    return _totalAgg(agg, getDefaultStream());
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg, cudaStream_t stream) {
+    NVMatrix tmp;
+    return _totalAgg(agg, tmp, stream);
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream) {
+    assert(isContiguous());
+    dim3 blocks, threads;
+    // Sum most of it on GPU
+
+    _sum_setParams(getNumElements(), &blocks, &threads);
+    tmpbuf.resize(1, blocks.x);
+    kTotalAgg<<<blocks, threads, 0, stream>>>(getDevData(), tmpbuf.getDevData(), getNumElements(), agg);
+    getLastCudaError("kTotalAgg: Kernel execution failed");
+    // Don't need to sync because we copyToHost in the same stream, so it's serialized
+//    NVMatrix::syncStream(stream);
+    return tmpbuf.cpuAgg(agg, stream);
+}
+template <class Agg>
+float NVMatrix::cpuAgg(Agg agg, cudaStream_t stream) {
+    Matrix bufCPU(getNumRows(), getNumCols());
+    copyToHost(bufCPU, false, stream);
+    if (getNumElements() > 1) { // Sum remainder on CPU
+        if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) {
+            return bufCPU.sum();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) {
+            return bufCPU.max();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) {
+            return bufCPU.min();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::CountNan)) {
+            return bufCPU.hasNan(); //yea, it's not the same, who cares
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::CountInf)) {
+            return bufCPU.hasInf();
+        } else {
+            assert(false);
+        }
+    }
+    return bufCPU(0,0);
+}
+
+float NVMatrix::dotProduct(NVMatrix& b) {
+    return dotProduct(b, getDefaultStream());
+}
+
+float NVMatrix::dotProduct(NVMatrix& b, cudaStream_t stream) {
+    NVMatrix tmp;
+    return dotProduct(b, tmp, stream);
+}
+
+/*
+ * Fast dot product only for matrices with same transposedness.
+ */
+float NVMatrix::dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream) {
+    assert(isContiguous() && b.isContiguous());
+    assert(isSameDims(b));
+    assert(isTrans() == b.isTrans()); // see?
+    dim3 blocks, threads;
+    _sum_setParams(getNumElements(), &blocks, &threads);
+//    NVMatrix target(1, blocks.x);
+    tmp.resize(1, blocks.x);
+    kDotProduct_r<<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), tmp.getDevData(), getNumElements());
+    getLastCudaError("kDotProduct_r: Kernel execution failed");
+//    cudaThreadSynchronize();
+//    syncStream(stream);
+//    return tmp._totalAgg(NVMatrixAggs::Sum(), stream);
+    return tmp.cpuAgg(NVMatrixAggs::Sum(), stream);
+}
+
+float NVMatrix::norm2() {
+    return dotProduct(*this);
+}
+
+float NVMatrix::norm() {
+    return sqrt(norm2());
+}
+
+void NVMatrix::print(int startRow, int rows, int startCol, int cols) const {
+//    cudaThreadSynchronize();
+    syncDevice();
+    Matrix hm = Matrix(_numRows, _numCols);
+    copyToHost(hm);
+    hm.print(startRow, rows, startCol, cols);
+}
+
+void NVMatrix::print(int rows, int cols) const {
+    print(0, rows, 0, cols);
+}
+
+void NVMatrix::printShape(const char* name) const {
+    printf("%s: %dx%d\n", name, _numRows, _numCols);
+}
+
+void NVMatrix::alloc(int numElements) {
+    _memSegment = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(numElements * sizeof(float));
+}
+
+void NVMatrix::dealloc() {
+    DEVICE_MEMORY_MANAGER::getInstance(_memSegment->getDeviceID()).free(_memSegment);
+    _memSegment = NULL;
+    deallocTexture();
+}
+
+void NVMatrix::deallocTexture() {
+    if (_texObj != 0) {
+        checkCudaErrors(cudaDestroyTextureObject(_texObj));
+        _texObj = 0;
+    }
+}
+
+cudaTextureObject_t NVMatrix::getTextureObject() {
+    if (_texObj == 0) {
+        assert(isContiguous());
+        //size_t memFree, memTotal;
+
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = cudaResourceTypeLinear;
+        resDesc.res.linear.devPtr = getDevData();
+        resDesc.res.linear.sizeInBytes = getNumDataBytes();
+        resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        checkCudaErrors(cudaCreateTextureObject(&_texObj, &resDesc, &texDesc, NULL));
+    }
+    assert(_texObj != 0);
+    return _texObj;
+}
+
+NVMatrix& NVMatrix::construct() const {
+    return *new NVMatrix();
+}
+NVMatrix& NVMatrix::construct(bool isTrans) const {
+    return *new NVMatrix(isTrans);
+}
+NVMatrix& NVMatrix::construct(int numRows, int numCols, bool isTrans) const {
+    return *new NVMatrix(numRows, numCols, isTrans);
+}
+NVMatrix& NVMatrix::construct(const Matrix& like, bool copy) const {
+    return *new NVMatrix(like, copy);
+}
+NVMatrix& NVMatrix::construct(const NVMatrix& like, bool copy) const {
+    return *new NVMatrix(like, copy);
+}
+NVMatrix& NVMatrix::construct(const NVMatrix& like) const {
+    return *new NVMatrix(like);
+}
+NVMatrix& NVMatrix::construct(const Matrix& like) const {
+    return *new NVMatrix(like);
+}
+NVMatrix& NVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
+    return *new NVMatrix(mem, numRows, numCols, stride, isTrans);
+}
+
+std::pair<size_t, size_t> NVMatrix::getCudaMemorySize() {
+    size_t memFree, memTotal;
+    checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal));
+    return std::pair<size_t, size_t>(memFree, memTotal);
+}
+
+
+/* ================
+ * HostNVMatrix
+ * ================
+ */
+HostNVMatrix::~HostNVMatrix() {
+    if (_ownsData && _numElements > 0) {
+        dealloc();
+    } else {
+        // dealloc frees the mem segment. But if this is a view,
+        // then we need to delete the mem segment object.
+//        assert(_memSegment == NULL || _memSegment->getSize() == 0);
+        delete _memSegment;
+    }
+    _deleted = true;
+}
+HostNVMatrix::HostNVMatrix() : NVMatrix() {
+    _init(false);
+}
+HostNVMatrix::HostNVMatrix(bool isTrans) {
+    _init(isTrans);
+}
+HostNVMatrix::HostNVMatrix(int numRows, int numCols, bool isTrans) {
+    _init(isTrans);
+    resize(numRows, numCols);
+}
+HostNVMatrix::HostNVMatrix(const Matrix& like, bool copy) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+    if (copy) {
+        copyFromHost(like);
+    }
+}
+HostNVMatrix::HostNVMatrix(const NVMatrix& like, bool copy) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+    if (copy) {
+        like.copy(*this);
+    }
+}
+HostNVMatrix::HostNVMatrix(const NVMatrix& like) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+}
+HostNVMatrix::HostNVMatrix(const Matrix& like) {
+    _init(false);
+    resize(like.getNumRows(), like.getNumCols());
+}
+HostNVMatrix::HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans)
+    : NVMatrix(mem, numRows, numCols, stride, isTrans) {
+}
+
+NVMatrix& HostNVMatrix::construct() const {
+    return *new HostNVMatrix();
+}
+NVMatrix& HostNVMatrix::construct(bool isTrans) const {
+    return *new HostNVMatrix(isTrans);
+}
+NVMatrix& HostNVMatrix::construct(int numRows, int numCols, bool isTrans) const {
+    return *new HostNVMatrix(numRows, numCols, isTrans);
+}
+NVMatrix& HostNVMatrix::construct(const Matrix& like, bool copy) const {
+    return *new HostNVMatrix(like, copy);
+}
+NVMatrix& HostNVMatrix::construct(const NVMatrix& like, bool copy) const {
+    return *new HostNVMatrix(like, copy);
+}
+NVMatrix& HostNVMatrix::construct(const NVMatrix& like) const {
+    return *new HostNVMatrix(like);
+}
+NVMatrix& HostNVMatrix::construct(const Matrix& like) const {
+    return *new HostNVMatrix(like);
+}
+NVMatrix& HostNVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
+    return *new HostNVMatrix(mem, numRows, numCols, stride, isTrans);
+}
+
+void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) {
+    if (resizeTarget) {
+        resize(hostMatrix);
+    } else {
+        assert(isSameDims(hostMatrix));
+    }
+    setTrans(hostMatrix.isTrans());
+    if (getNumElements() > 0) {
+        checkCudaErrors(cudaMemcpy2D(getDevData(), _stride * sizeof(float), hostMatrix.getData(),
+                                     hostMatrix.getLeadingDim() * sizeof(float), getLeadingDim() * sizeof(float),
+                                     getFollowingDim(), cudaMemcpyHostToHost));
+//        syncStream(stream);
+    }
+}
+
+void
HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { + copyFromHost(hostMatrix, resizeTarget, 0); +} + +void HostNVMatrix::copyFromHost(const Matrix& hostMatrix) { + copyFromHost(hostMatrix, false, 0); +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { + if (resizeTarget) { + hostMatrix.resize(getNumRows(), getNumCols()); + } else { + assert(isSameDims(hostMatrix)); + } + hostMatrix.setTrans(_isTrans); + if (getNumElements() > 0) { + checkCudaErrors(cudaMemcpy2D(hostMatrix.getData(), hostMatrix.getLeadingDim() * sizeof(float), + getDevData(), _stride * sizeof(float), getLeadingDim() * sizeof(float), + getFollowingDim(), cudaMemcpyHostToHost)); +// syncStream(stream); + } +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { + copyToHost(hostMatrix, resizeTarget, 0); +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix) const { + copyToHost(hostMatrix, false, 0); +} + +void HostNVMatrix::alloc(int numElements) { +// checkCudaErrors(cudaHostAlloc(&_devData, numElements * sizeof(float), cudaHostAllocPortable)); + _memSegment = HOST_MEMORY_MANAGER::getInstance().malloc(numElements * sizeof(float)); +// _memSegment = FastHostMemoryManager::getInstance().malloc(numElements * sizeof(float)); +} + +void HostNVMatrix::dealloc() { +// FastHostMemoryManager::getInstance().free(_memSegment); + HOST_MEMORY_MANAGER::getInstance().free(_memSegment); + _memSegment = NULL; +// checkCudaErrors(cudaFreeHost(_devData)); +} + +cudaTextureObject_t HostNVMatrix::getTextureObject() { + assert(false); + return 0; +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu new file mode 100644 index 0000000..628a1f5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu @@ -0,0 +1,77 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/nvmatrix_kernels.cuh" + +__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int numThreads = blockDim.x * gridDim.x; + // const unsigned int numEls = tgtWidth * tgtHeight; + for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) { + const uint y = i / tgtWidth; + const uint x = i % tgtWidth; + const uint srcY = y % srcHeight; + const uint srcX = x % srcWidth; + tgt[i] = src[srcY * srcWidth + srcX]; + } +} + +__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements) { + __shared__ float shmem[DP_BLOCKSIZE]; + + uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x; + shmem[threadIdx.x] = 0; + if (eidx < gridDim.x * DP_BLOCKSIZE) { + for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) { + shmem[threadIdx.x] += a[eidx] * b[eidx]; + } + } + __syncthreads(); + if (threadIdx.x < 256) { + shmem[threadIdx.x] += shmem[threadIdx.x + 256]; + } + __syncthreads(); + if (threadIdx.x < 128) { + shmem[threadIdx.x] += shmem[threadIdx.x + 128]; + } + __syncthreads(); + if (threadIdx.x < 64) { + shmem[threadIdx.x] += shmem[threadIdx.x + 64]; + } + __syncthreads(); + if (threadIdx.x < 32) { + volatile float* mysh = &shmem[threadIdx.x]; + *mysh += mysh[32]; + *mysh += mysh[16]; + *mysh += mysh[8]; + *mysh += mysh[4]; + *mysh += mysh[2]; + *mysh += mysh[1]; + if (threadIdx.x == 0) { + target[blockIdx.x] = *mysh; + } + } +} + +__global__ void kSetupCurand(curandState *state, unsigned long long seed) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + /* Each thread gets same seed, a different sequence number, + no offset */ + curand_init(seed, tidx, 0, &state[tidx]); +} + diff --git a/caffe2/contrib/cuda-convnet2/python_util/__init__.py b/caffe2/contrib/cuda-convnet2/python_util/__init__.py new file mode 100644 index 0000000..520b1ea --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/python_util/data.py b/caffe2/contrib/cuda-convnet2/python_util/data.py new file mode 100644 index 0000000..d8c8ff1 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/data.py @@ -0,0 +1,194 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as n +from numpy.random import randn, rand, random_integers +import os +from threading import Thread +from util import * + +BATCH_META_FILE = "batches.meta" + +class DataLoaderThread(Thread): + def __init__(self, path, tgt): + Thread.__init__(self) + self.path = path + self.tgt = tgt + def run(self): + self.tgt += [unpickle(self.path)] + +class DataProvider: + BATCH_REGEX = re.compile('^data_batch_(\d+)(\.\d+)?$') + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): + if batch_range == None: + batch_range = DataProvider.get_batch_nums(data_dir) + if init_batchnum is None or init_batchnum not in batch_range: + init_batchnum = batch_range[0] + + self.data_dir = data_dir + self.batch_range = batch_range + self.curr_epoch = init_epoch + self.curr_batchnum = init_batchnum + self.dp_params = dp_params + self.batch_meta = self.get_batch_meta(data_dir) + self.data_dic = None + self.test = test + self.batch_idx = batch_range.index(init_batchnum) + + def get_next_batch(self): + if self.data_dic is None or len(self.batch_range) > 1: + self.data_dic = self.get_batch(self.curr_batchnum) + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + + return epoch, batchnum, self.data_dic + + def get_batch(self, batch_num): + fname = self.get_data_file_name(batch_num) + if os.path.isdir(fname): # batch in sub-batches + sub_batches = sorted(os.listdir(fname), key=alphanum_key) + #print sub_batches + num_sub_batches = len(sub_batches) + tgts = [[] for i in xrange(num_sub_batches)] + threads = [DataLoaderThread(os.path.join(fname, s), tgt) for (s, tgt) in zip(sub_batches, tgts)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return [t[0] for t in tgts] + return unpickle(self.get_data_file_name(batch_num)) + + def get_data_dims(self,idx=0): + return self.batch_meta['num_vis'] if idx == 0 else 1 + + def advance_batch(self): + self.batch_idx = self.get_next_batch_idx() + self.curr_batchnum = self.batch_range[self.batch_idx] + if self.batch_idx == 0: # we wrapped + self.curr_epoch += 1 + + def get_next_batch_idx(self): + return (self.batch_idx + 1) % len(self.batch_range) + + def get_next_batch_num(self): + return self.batch_range[self.get_next_batch_idx()] + + # get filename of current batch + def get_data_file_name(self, batchnum=None): + if batchnum is None: + batchnum = self.curr_batchnum + return os.path.join(self.data_dir, 'data_batch_%d' % batchnum) + + @classmethod + def get_instance(cls, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, type="default", dp_params={}, test=False): + # why the fuck can't i reference DataProvider in the original definition? 
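+        # Resolve the provider class from the type string: "dummy-..." types are mapped to
+        # the registered dummy provider class (the trailing number is parsed off and passed
+        # as the data dimensionality), while any other type is looked up directly in dp_classes.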
+ #cls.dp_classes['default'] = DataProvider + type = type or DataProvider.get_batch_meta(data_dir)['dp_type'] # allow data to decide data provider + if type.startswith("dummy-"): + name = "-".join(type.split('-')[:-1]) + "-n" + if name not in dp_types: + raise DataProviderException("No such data provider: %s" % type) + _class = dp_classes[name] + dims = int(type.split('-')[-1]) + return _class(dims) + elif type in dp_types: + _class = dp_classes[type] + return _class(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + + raise DataProviderException("No such data provider: %s" % type) + + @classmethod + def register_data_provider(cls, name, desc, _class): + if name in dp_types: + raise DataProviderException("Data provider %s already registered" % name) + dp_types[name] = desc + dp_classes[name] = _class + + @staticmethod + def get_batch_meta(data_dir): + return unpickle(os.path.join(data_dir, BATCH_META_FILE)) + + @staticmethod + def get_batch_filenames(srcdir): + return sorted([f for f in os.listdir(srcdir) if DataProvider.BATCH_REGEX.match(f)], key=alphanum_key) + + @staticmethod + def get_batch_nums(srcdir): + names = DataProvider.get_batch_filenames(srcdir) + return sorted(list(set(int(DataProvider.BATCH_REGEX.match(n).group(1)) for n in names))) + + @staticmethod + def get_num_batches(srcdir): + return len(DataProvider.get_batch_nums(srcdir)) + +class DummyDataProvider(DataProvider): + def __init__(self, data_dim): + #self.data_dim = data_dim + self.batch_range = [1] + self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True} + self.curr_epoch = 1 + self.curr_batchnum = 1 + self.batch_idx = 0 + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + data = rand(512, self.get_data_dims()).astype(n.single) + return self.curr_epoch, self.curr_batchnum, {'data':data} + +class LabeledDataProvider(DataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): + DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + + def get_num_classes(self): + return len(self.batch_meta['label_names']) + +class LabeledDummyDataProvider(DummyDataProvider): + def __init__(self, data_dim, num_classes=10, num_cases=7): + #self.data_dim = data_dim + self.batch_range = [1] + self.batch_meta = {'num_vis': data_dim, + 'label_names': [str(x) for x in range(num_classes)], + 'data_in_rows':True} + self.num_cases = num_cases + self.num_classes = num_classes + self.curr_epoch = 1 + self.curr_batchnum = 1 + self.batch_idx=0 + self.data = None + + def get_num_classes(self): + return self.num_classes + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + if self.data is None: + data = rand(self.num_cases, self.get_data_dims()).astype(n.single) # <--changed to rand + labels = n.require(n.c_[random_integers(0,self.num_classes-1,self.num_cases)], requirements='C', dtype=n.single) + self.data, self.labels = data, labels + else: + data, labels = self.data, self.labels +# print data.shape, labels.shape + return self.curr_epoch, self.curr_batchnum, [data.T, labels.T ] + + +dp_types = {"dummy-n": "Dummy data provider for n-dimensional data", + "dummy-labeled-n": "Labeled dummy data provider for n-dimensional data"} +dp_classes = {"dummy-n": DummyDataProvider, + "dummy-labeled-n": LabeledDummyDataProvider} + +class DataProviderException(Exception): + pass diff --git 
a/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py b/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py new file mode 100644 index 0000000..d4df71c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py @@ -0,0 +1,358 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as n +import os +from time import time, asctime, localtime, strftime +from util import * +from data import * +from options import * +from math import ceil, floor, sqrt +from data import DataProvider, dp_types +import sys +import shutil +import platform +from os import linesep as NL +from threading import Thread +import tempfile as tf + +class ModelStateException(Exception): + pass + +class CheckpointWriter(Thread): + def __init__(self, path, dic): + Thread.__init__(self) + self.path = path + self.dic = dic + + def run(self): + save_dir = os.path.dirname(self.path) + save_file = os.path.basename(self.path) + # Write checkpoint to temporary filename + tmpfile = tf.NamedTemporaryFile(dir=os.path.dirname(save_dir), delete=False) + pickle(tmpfile, self.dic) # Also closes tf + # Move it to final filename + os.rename(tmpfile.name, self.path) + # Delete old checkpoints + for f in os.listdir(save_dir): + if f != save_file: + os.remove(os.path.join(save_dir, f)) + +# GPU Model interface +class IGPUModel: + def __init__(self, model_name, op, load_dic, filename_options=[], dp_params={}): + # these are input parameters + self.model_name = model_name + self.op = op + self.options = op.options + self.load_dic = load_dic + self.filename_options = filename_options + self.dp_params = dp_params + self.device_ids = self.op.get_value('gpu') + self.fill_excused_options() + self.checkpoint_writer = None + #assert self.op.all_values_given() + + for o in op.get_options_list(): + setattr(self, o.name, o.value) + self.loaded_from_checkpoint = load_dic is not None + # these are things that the model must remember but they're not input parameters + if self.loaded_from_checkpoint: + self.model_state = load_dic["model_state"] + self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else self.options['load_file'].value + if not os.path.isdir(self.save_file) and os.path.exists(self.save_file): + self.save_file = os.path.dirname(self.save_file) +# print self.options["save_file_override"].value, self.save_file + else: + self.model_state = {} + self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else os.path.join(self.options['save_path'].value, model_name + "_" + '_'.join(['%s_%s' % (char, self.options[opt].get_str_value()) for opt, char in filename_options]) + '_' + strftime('%Y-%m-%d_%H.%M.%S')) + self.model_state["train_outputs"] = [] + self.model_state["test_outputs"] = [] + self.model_state["epoch"] = 1 + self.model_state["batchnum"] = self.train_batch_range[0] +# print self.save_file + + self.init_data_providers() + if load_dic: + 
self.train_data_provider.advance_batch() + + # model state often requries knowledge of data provider, so it's initialized after + try: + self.init_model_state() + except ModelStateException, e: + print e + sys.exit(1) + for var, val in self.model_state.iteritems(): + setattr(self, var, val) + + self.import_model() + self.init_model_lib() + + def import_model(self): + print "=========================" + print "Importing %s C++ module" % ('_' + self.model_name) + self.libmodel = __import__('_' + self.model_name) + + def fill_excused_options(self): + pass + + def init_data_providers(self): + self.dp_params['convnet'] = self + try: + self.test_data_provider = DataProvider.get_instance(self.data_path, self.test_batch_range, + type=self.dp_type, dp_params=self.dp_params, test=True) + self.train_data_provider = DataProvider.get_instance(self.data_path, self.train_batch_range, + self.model_state["epoch"], self.model_state["batchnum"], + type=self.dp_type, dp_params=self.dp_params, test=False) + except DataProviderException, e: + print "Unable to create data provider: %s" % e + self.print_data_providers() + sys.exit() + + def init_model_state(self): + pass + + def init_model_lib(self): + pass + + def start(self): + if self.test_only: + self.test_outputs += [self.get_test_error()] + self.print_test_results() + else: + self.train() + self.cleanup() + if self.force_save: + self.save_state().join() + sys.exit(0) + + def train(self): + print "=========================" + print "Training %s" % self.model_name + self.op.print_values() + print "=========================" + self.print_model_state() + print "Running on CUDA device(s) %s" % ", ".join("%d" % d for d in self.device_ids) + print "Current time: %s" % asctime(localtime()) + print "Saving checkpoints to %s" % self.save_file + print "=========================" + next_data = self.get_next_batch() + while self.epoch <= self.num_epochs: + data = next_data + self.epoch, self.batchnum = data[0], data[1] + self.print_iteration() + sys.stdout.flush() + + compute_time_py = time() + self.start_batch(data) + + # load the next batch while the current one is computing + next_data = self.get_next_batch() + + batch_output = self.finish_batch() + self.train_outputs += [batch_output] + self.print_train_results() + + if self.get_num_batches_done() % self.testing_freq == 0: + self.sync_with_host() + self.test_outputs += [self.get_test_error()] + self.print_test_results() + self.print_test_status() + self.conditional_save() + + self.print_elapsed_time(time() - compute_time_py) + + def cleanup(self): + if self.checkpoint_writer is not None: + self.checkpoint_writer.join() + self.checkpoint_writer = None + + def print_model_state(self): + pass + + def get_num_batches_done(self): + return len(self.train_batch_range) * (self.epoch - 1) + self.batchnum - self.train_batch_range[0] + 1 + + def get_next_batch(self, train=True): + dp = self.train_data_provider + if not train: + dp = self.test_data_provider + return self.parse_batch_data(dp.get_next_batch(), train=train) + + def parse_batch_data(self, batch_data, train=True): + return batch_data[0], batch_data[1], batch_data[2]['data'] + + def start_batch(self, batch_data, train=True): + self.libmodel.startBatch(batch_data[2], not train) + + def finish_batch(self): + return self.libmodel.finishBatch() + + def print_iteration(self): + print "\t%d.%d..." 
% (self.epoch, self.batchnum), + + def print_elapsed_time(self, compute_time_py): + print "(%.3f sec)" % (compute_time_py) + + def print_train_results(self): + batch_error = self.train_outputs[-1][0] + if not (batch_error > 0 and batch_error < 2e20): + print "Crazy train error: %.6f" % batch_error + self.cleanup() + + print "Train error: %.6f " % (batch_error), + + def print_test_results(self): + batch_error = self.test_outputs[-1][0] + print "%s\t\tTest error: %.6f" % (NL, batch_error), + + def print_test_status(self): + status = (len(self.test_outputs) == 1 or self.test_outputs[-1][0] < self.test_outputs[-2][0]) and "ok" or "WORSE" + print status, + + def sync_with_host(self): + if self.checkpoint_writer is not None: + self.checkpoint_writer.join() + self.checkpoint_writer = None + self.libmodel.syncWithHost() + + def conditional_save(self): + batch_error = self.test_outputs[-1][0] + if batch_error > 0 and batch_error < self.max_test_err: + self.save_state() + else: + print "\tTest error > %g, not saving." % self.max_test_err, + + def aggregate_test_outputs(self, test_outputs): + test_error = tuple([sum(t[r] for t in test_outputs) / (1 if self.test_one else len(self.test_batch_range)) for r in range(len(test_outputs[-1]))]) + return test_error + + def get_test_error(self): + next_data = self.get_next_batch(train=False) + test_outputs = [] + while True: + data = next_data + start_time_test = time() + self.start_batch(data, train=False) + load_next = (not self.test_one or self.test_only) and data[1] < self.test_batch_range[-1] + if load_next: # load next batch + next_data = self.get_next_batch(train=False) + test_outputs += [self.finish_batch()] + if self.test_only: # Print the individual batch results for safety + print "batch %d: %s" % (data[1], str(test_outputs[-1])), + self.print_elapsed_time(time() - start_time_test) + if not load_next: + break + sys.stdout.flush() + + return self.aggregate_test_outputs(test_outputs) + + def set_var(self, var_name, var_val): + setattr(self, var_name, var_val) + self.model_state[var_name] = var_val + return var_val + + def get_var(self, var_name): + return self.model_state[var_name] + + def has_var(self, var_name): + return var_name in self.model_state + + def save_state(self): + for att in self.model_state: + if hasattr(self, att): + self.model_state[att] = getattr(self, att) + + dic = {"model_state": self.model_state, + "op": self.op} + + checkpoint_file = "%d.%d" % (self.epoch, self.batchnum) + checkpoint_file_full_path = os.path.join(self.save_file, checkpoint_file) + if not os.path.exists(self.save_file): + os.makedirs(self.save_file) + + assert self.checkpoint_writer is None + self.checkpoint_writer = CheckpointWriter(checkpoint_file_full_path, dic) + self.checkpoint_writer.start() + print "-------------------------------------------------------" + print "Saved checkpoint to %s" % self.save_file + print "=======================================================", + return self.checkpoint_writer + + def get_progress(self): + num_batches_total = self.num_epochs * len(self.train_batch_range) + return min(1.0, max(0.0, float(self.get_num_batches_done()-1) / num_batches_total)) + + @staticmethod + def load_checkpoint(load_dir): + if os.path.isdir(load_dir): + return unpickle(os.path.join(load_dir, sorted(os.listdir(load_dir), key=alphanum_key)[-1])) + return unpickle(load_dir) + + @staticmethod + def get_options_parser(): + op = OptionsParser() + op.add_option("load-file", "load_file", StringOptionParser, "Load file", default="", 
excuses=OptionsParser.EXCUSE_ALL) + op.add_option("save-path", "save_path", StringOptionParser, "Save path", excuses=['save_file_override']) + op.add_option("save-file", "save_file_override", StringOptionParser, "Save file override", excuses=['save_path']) + op.add_option("train-range", "train_batch_range", RangeOptionParser, "Data batch range: training") + op.add_option("test-range", "test_batch_range", RangeOptionParser, "Data batch range: testing") + op.add_option("data-provider", "dp_type", StringOptionParser, "Data provider", default="default") + op.add_option("test-freq", "testing_freq", IntegerOptionParser, "Testing frequency", default=25) + op.add_option("epochs", "num_epochs", IntegerOptionParser, "Number of epochs", default=500) + op.add_option("data-path", "data_path", StringOptionParser, "Data path") + + op.add_option("max-test-err", "max_test_err", FloatOptionParser, "Maximum test error for saving") + op.add_option("test-only", "test_only", BooleanOptionParser, "Test and quit?", default=0) + op.add_option("test-one", "test_one", BooleanOptionParser, "Test on one batch at a time?", default=1) + op.add_option("force-save", "force_save", BooleanOptionParser, "Force save before quitting", default=0) + op.add_option("gpu", "gpu", ListOptionParser(IntegerOptionParser), "GPU override") + return op + + @staticmethod + def print_data_providers(): + print "Available data providers:" + for dp, desc in dp_types.iteritems(): + print " %s: %s" % (dp, desc) + + + @staticmethod + def parse_options(op): + try: + load_dic = None + options = op.parse() + load_location = None +# print options['load_file'].value_given, options['save_file_override'].value_given +# print options['save_file_override'].value + if options['load_file'].value_given: + load_location = options['load_file'].value + elif options['save_file_override'].value_given and os.path.exists(options['save_file_override'].value): + load_location = options['save_file_override'].value + + if load_location is not None: + load_dic = IGPUModel.load_checkpoint(load_location) + old_op = load_dic["op"] + old_op.merge_from(op) + op = old_op + op.eval_expr_defaults() + return op, load_dic + except OptionMissingException, e: + print e + op.print_usage() + except OptionException, e: + print e + except UnpickleError, e: + print "Error loading checkpoint:" + print e + sys.exit() diff --git a/caffe2/contrib/cuda-convnet2/python_util/options.py b/caffe2/contrib/cuda-convnet2/python_util/options.py new file mode 100644 index 0000000..afc6ed5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/options.py @@ -0,0 +1,408 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
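+
+# Declarative command-line option framework used by the cuda-convnet2 front-end scripts:
+# an OptionsParser holds typed Option objects (integer, float, boolean, string, range and
+# list parsers defined below), options may "excuse" or require one another, and defaults
+# can be given as OptionExpression strings evaluated against the other option values.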
+ +import sys +from getopt import getopt +import os +import re +#import types + +TERM_BOLD_START = "\033[1m" +TERM_BOLD_END = "\033[0m" + +class Option: + def __init__(self, letter, name, desc, parser, set_once, default, excuses, requires, save): + assert not name is None + self.letter = letter + self.name = name + self.desc = desc + self.parser = parser + self.set_once = set_once + self.default = default + self.excuses = excuses + self.requires = requires + self.save = save + + self.value = None + self.value_given = False + self.prefixed_letter = min(2, len(letter)) * '-' + letter + + def set_value(self, value, parse=True): + try: + self.value = self.parser.parse(value) if parse else value + self.value_given = True +# print self.name, self.value + except OptionException, e: + raise OptionException("Unable to parse option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) + + def set_default(self): + if not self.default is None: + self.value = self.default + + def eval_expr_default(self, env): + try: + if isinstance(self.default, OptionExpression) and not self.value_given: + self.value = self.default.evaluate(env) + if not self.parser.is_type(self.value): + raise OptionException("expression result %s is not of right type (%s)" % (self.value, self.parser.get_type_str())) + except Exception, e: + raise OptionException("Unable to set default value for option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) + + def get_str_value(self, get_default_str=False): + val = self.value + if get_default_str: val = self.default + if val is None: return "" + if isinstance(val, OptionExpression): + return val.expr + return self.parser.to_string(val) + +class OptionsParser: + """An option parsing class. All options without default values are mandatory, unless a excuses + option (usually a load file) is given. + Does not support options without arguments.""" + SORT_LETTER = 1 + SORT_DESC = 2 + SORT_EXPR_LAST = 3 + EXCUSE_ALL = "all" + def __init__(self): + self.options = {} + + def add_option(self, letter, name, parser, desc, set_once=False, default=None, excuses=[], requires=[], save=True): + """ + The letter parameter is the actual parameter that the user will have to supply on the command line. + The name parameter is some name to be given to this option and must be a valid python variable name. + + An explanation of the "default" parameter: + The default value, if specified, should have the same type as the option. + You can also specify an expression as the default value. In this case, the default value of the parameter + will be the output of the expression. The expression may assume all other option names + as local variables. For example, you can define the hidden bias + learning rate to be 10 times the weight learning rate by setting this default: + + default=OptionExpression("eps_w * 10") (assuming an option named eps_w exists). + + However, it is up to you to make sure you do not make any circular expression definitions. + + Note that the order in which the options are parsed is arbitrary. + In particular, expression default values that depend on other expression default values + will often raise errors (depending on the order in which they happen to be parsed). + Therefore it is best not to make the default value of one variable depend on the value + of another if the other variable's default value is itself an expression. + + An explanation of the "excuses" parameter: + All options are mandatory, but certain options can exclude other options from being mandatory. 
+ For example, if the excuses parameter for option "load_file" is ["num_hid", "num_vis"], + then the options num_hid and num_vis are not mandatory as long as load_file is specified. + Use the special flag EXCUSE_ALL to allow an option to make all other options optional. + """ + + assert name not in self.options + self.options[name] = Option(letter, name, desc, parser, set_once, default, excuses, requires, save) + + def set_value(self, name, value, parse=True): + self.options[name].set_value(value, parse=parse) + + def get_value(self, name): + return self.options[name].value + + def delete_option(self, name): + if name in self.options: + del self.options[name] + + def parse(self, eval_expr_defaults=False): + """Parses the options in sys.argv based on the options added to this parser. The + default behavior is to leave any expression default options as OptionExpression objects. + Set eval_expr_defaults=True to circumvent this.""" + short_opt_str = ''.join(["%s:" % self.options[name].letter for name in self.options if len(self.options[name].letter) == 1]) + long_opts = ["%s=" % self.options[name].letter for name in self.options if len(self.options[name].letter) > 1] + (go, ga) = getopt(sys.argv[1:], short_opt_str, longopts=long_opts) + dic = dict(go) + + for o in self.get_options_list(sort_order=self.SORT_EXPR_LAST): + if o.prefixed_letter in dic: + o.set_value(dic[o.prefixed_letter]) + else: + # check if excused or has default + excused = max([o2.prefixed_letter in dic for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses]) + if not excused and o.default is None: + raise OptionMissingException("Option %s (%s) not supplied" % (o.prefixed_letter, o.desc)) + o.set_default() + # check requirements + if o.prefixed_letter in dic: + for o2 in self.get_options_list(sort_order=self.SORT_LETTER): + if o2.name in o.requires and o2.prefixed_letter not in dic: + raise OptionMissingException("Option %s (%s) requires option %s (%s)" % (o.prefixed_letter, o.desc, + o2.prefixed_letter, o2.desc)) + if eval_expr_defaults: + self.eval_expr_defaults() + return self.options + + def merge_from(self, op2): + """Merges the options in op2 into this instance, but does not overwrite + this instances's SET options with op2's default values.""" + for name, o in self.options.iteritems(): + if name in op2.options and ((op2.options[name].value_given and op2.options[name].value != self.options[name].value) or not op2.options[name].save): + if op2.options[name].set_once: + raise OptionException("Option %s (%s) cannot be changed" % (op2.options[name].prefixed_letter, op2.options[name].desc)) + self.options[name] = op2.options[name] + for name in op2.options: + if name not in self.options: + self.options[name] = op2.options[name] + + def eval_expr_defaults(self): + env = dict([(name, o.value) for name, o in self.options.iteritems()]) + for o in self.options.values(): + o.eval_expr_default(env) + + def all_values_given(self): + return max([o.value_given for o in self.options.values() if o.default is not None]) + + def get_options_list(self, sort_order=SORT_LETTER): + """ Returns the list of Option objects in this OptionParser, + sorted as specified""" + + cmp = lambda x, y: (x.desc < y.desc and -1 or 1) + if sort_order == self.SORT_LETTER: + cmp = lambda x, y: (x.letter < y.letter and -1 or 1) + elif sort_order == self.SORT_EXPR_LAST: + cmp = lambda x, y: (type(x.default) == OptionExpression and 1 or -1) + return sorted(self.options.values(), cmp=cmp) + + def print_usage(self, 
print_constraints=False): + print "%s usage:" % os.path.basename(sys.argv[0]) + opslist = self.get_options_list() + + usage_strings = [] + num_def = 0 + for o in opslist: + excs = ' ' + if o.default is None: + excs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses])) + reqs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.name in o.requires])) + usg = (OptionsParser._bold(o.prefixed_letter) + " <%s>" % o.parser.get_type_str(), o.desc, ("[%s]" % o.get_str_value(get_default_str=True)) if not o.default is None else None, excs, reqs) + if o.default is None: + usage_strings += [usg] + else: + usage_strings.insert(num_def, usg) + num_def += 1 + + col_widths = [self._longest_value(usage_strings, key=lambda x:x[i]) for i in range(len(usage_strings[0]) - 1)] + + col_names = [" Option", "Description", "Default"] + if print_constraints: + col_names += ["Excused by", "Requires"] + for i, s in enumerate(col_names): + print self._bold(s.ljust(col_widths[i])), + + print "" + for l, d, de, ex, req in usage_strings: + if de is None: + de = ' ' + print (" %s -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), + else: + print (" [%s] -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), + if print_constraints: + print ex.ljust(col_widths[3]), req + else: + print "" + + def print_values(self): + longest_desc = self._longest_value(self.options.values(), key=lambda x:x.desc) + longest_def_value = self._longest_value([v for v in self.options.values() if not v.value_given and not v.default is None], + key=lambda x:x.get_str_value()) + for o in self.get_options_list(sort_order=self.SORT_DESC): + print "%s: %s %s" % (o.desc.ljust(longest_desc), o.get_str_value().ljust(longest_def_value), (not o.value_given and not o.default is None) and "[DEFAULT]" or "") + + @staticmethod + def _longest_value(values, key=lambda x:x): + mylen = lambda x: 0 if x is None else len(x) + return mylen(key(max(values, key=lambda x:mylen(key(x))))) + + @staticmethod + def _bold(str): + return TERM_BOLD_START + str + TERM_BOLD_END + +class OptionException(Exception): + pass + +class OptionMissingException(OptionException): + pass + +class OptionParser: + @staticmethod + def parse(value): + return str(value) + + @staticmethod + def to_string(value): + return str(value) + + @staticmethod + def get_type_str(): + pass + +class IntegerOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + return int(value) + except: + raise OptionException("argument is not an integer") + + @staticmethod + def get_type_str(): + return "int" + + @staticmethod + def is_type(value): + return type(value) == int + +class BooleanOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + v = int(value) + if not v in (0,1): + raise OptionException + return v + except: + raise OptionException("argument is not a boolean") + + @staticmethod + def get_type_str(): + return "0/1" + + @staticmethod + def is_type(value): + return type(value) == int and value in (0, 1) + +class StringOptionParser(OptionParser): + @staticmethod + def get_type_str(): + return "string" + + @staticmethod + def is_type(value): + return type(value) == str + +class FloatOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + return float(value) + except: + raise OptionException("argument is not a float") + + @staticmethod + def to_string(value): + return "%.6g" % value + + @staticmethod + def 
get_type_str(): + return "float" + + @staticmethod + def is_type(value): + return type(value) == float + +class RangeOptionParser(OptionParser): + @staticmethod + def parse(value): + m = re.match("^(\d+)\-(\d+)$", value) + try: + if m: return range(int(m.group(1)), int(m.group(2)) + 1) + return [int(value)] + except: + raise OptionException("argument is neither an integer nor a range") + + @staticmethod + def to_string(value): + return "%d-%d" % (value[0], value[-1]) + + @staticmethod + def get_type_str(): + return "int[-int]" + + @staticmethod + def is_type(value): + return type(value) == list + +class ListOptionParser(OptionParser): + """ + A parser that parses a delimited list of items. If the "parsers" + argument is a list of parsers, then the list of items must have the form and length + specified by that list. + + Example: + ListOptionParser([FloatOptionParser, IntegerOptionParser]) + + would parse "0.5,3" but not "0.5,3,0.6" or "0.5" or "3,0.5". + + If the "parsers" argument is another parser, then the list of items may be of + arbitrary length, but each item must be parseable by the given parser. + + Example: + ListOptionParser(FloatOptionParser) + + would parse "0.5" and "0.5,0.3" and "0.5,0.3,0.6", etc. + """ + def __init__(self, parsers, sepchar=','): + self.parsers = parsers + self.sepchar = sepchar + + def parse(self, value): + values = value.split(self.sepchar) + if type(self.parsers) == list and len(values) != len(self.parsers): + raise OptionException("requires %d arguments, given %d" % (len(self.parsers), len(values))) + + try: + if type(self.parsers) == list: + return [p.parse(v) for p, v in zip(self.parsers, values)] + return [self.parsers.parse(v) for v in values] + except: + raise OptionException("argument is not of the form %s" % self.get_type_str()) + + def to_string(self, value): + if type(self.parsers) == list: + return self.sepchar.join([p.to_string(v) for p, v in zip(self.parsers, value)]) + return self.sepchar.join([self.parsers.to_string(v) for v in value]) + + def get_type_str(self): + if type(self.parsers) == list: + return self.sepchar.join([p.get_type_str() for p in self.parsers]) + return "%s%s..." % (self.parsers.get_type_str(), self.sepchar) + + @staticmethod + def is_type(value): + return type(value) == list + +class OptionExpression: + """ + This allows you to specify option values in terms of other option values. + Example: + op.add_option("eps-w", "eps_w", ListOptionParser(FloatOptionParser), "Weight learning rates for each layer") + op.add_option("eps-b", "eps_b", ListOptionParser(FloatOptionParser), "Bias learning rates for each layer", default=OptionExpression("[o * 10 for o in eps_w]")) + + This says: the default bias learning rate for each layer is 10 + times the weight learning rate for that layer. + """ + def __init__(self, expr): + self.expr = expr + + def evaluate(self, options): + locals().update(options) + try: + return eval(self.expr) + except Exception, e: + raise OptionException("expression '%s': unable to parse: %s" % (self.expr, e)) diff --git a/caffe2/contrib/cuda-convnet2/python_util/util.py b/caffe2/contrib/cuda-convnet2/python_util/util.py new file mode 100644 index 0000000..b3b6211 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/util.py @@ -0,0 +1,94 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import cPickle +import os +from cStringIO import StringIO + +class UnpickleError(Exception): + pass + +GPU_LOCK_NO_SCRIPT = -2 +GPU_LOCK_NO_LOCK = -1 + +def pickle(filename, data): + fo = filename + if type(filename) == str: + fo = open(filename, "w") + + cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) + fo.close() + +def unpickle(filename): + if not os.path.exists(filename): + raise UnpickleError("Path '%s' does not exist." % filename) + + fo = open(filename, 'r') + z = StringIO() + file_size = os.fstat(fo.fileno()).st_size + # Read 1GB at a time to avoid overflow + while fo.tell() < file_size: + z.write(fo.read(1 << 30)) + fo.close() + dict = cPickle.loads(z.getvalue()) + z.close() + + return dict + +def is_intel_machine(): + VENDOR_ID_REGEX = re.compile('^vendor_id\s+: (\S+)') + f = open('/proc/cpuinfo') + for line in f: + m = VENDOR_ID_REGEX.match(line) + if m: + f.close() + return m.group(1) == 'GenuineIntel' + f.close() + return False + +# Returns the CPUs associated with a given GPU +def get_cpus_for_gpu(gpu): + #proc = subprocess.Popen(['nvidia-smi', '-q', '-i', str(gpu)], stdout=subprocess.PIPE) + #lines = proc.communicate()[0] + #lines = subprocess.check_output(['nvidia-smi', '-q', '-i', str(gpu)]).split(os.linesep) + + with open('/proc/driver/nvidia/gpus/%d/information' % gpu) as f: + for line in f: + if line.startswith('Bus Location'): + bus_id = line.split(':', 1)[1].strip() + bus_id = bus_id[:7] + ':' + bus_id[8:] + ff = open('/sys/module/nvidia/drivers/pci:nvidia/%s/local_cpulist' % bus_id) + cpus_str = ff.readline() + ff.close() + cpus = [cpu for s in cpus_str.split(',') for cpu in range(int(s.split('-')[0]),int(s.split('-')[1])+1)] + return cpus + return [-1] + +def get_cpu(): + if is_intel_machine(): + return 'intel' + return 'amd' + +def is_windows_machine(): + return os.name == 'nt' + +def tryint(s): + try: + return int(s) + except: + return s + +def alphanum_key(s): + return [tryint(c) for c in re.split('([0-9]+)', s)] diff --git a/caffe2/contrib/cuda-convnet2/shownet.py b/caffe2/contrib/cuda-convnet2/shownet.py new file mode 100644 index 0000000..6e1bf11 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/shownet.py @@ -0,0 +1,341 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
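+
+# shownet.py is the visualization front-end for trained cuda-convnet2 models: it loads a
+# saved checkpoint through ConvNet/IGPUModel and can plot training/test cost curves,
+# the learned filters of a chosen layer, and sample predictions on a test batch.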
+ +import os +import sys +from tarfile import TarFile, TarInfo +from matplotlib import pylab as pl +import numpy as n +import getopt as opt +from python_util.util import * +from math import sqrt, ceil, floor +from python_util.gpumodel import IGPUModel +import random as r +import numpy.random as nr +from convnet import ConvNet +from python_util.options import * +from PIL import Image +from time import sleep + +class ShowNetError(Exception): + pass + +class ShowConvNet(ConvNet): + def __init__(self, op, load_dic): + ConvNet.__init__(self, op, load_dic) + + def init_data_providers(self): + self.need_gpu = self.op.get_value('show_preds') + class Dummy: + def advance_batch(self): + pass + if self.need_gpu: + ConvNet.init_data_providers(self) + else: + self.train_data_provider = self.test_data_provider = Dummy() + + def import_model(self): + if self.need_gpu: + ConvNet.import_model(self) + + def init_model_state(self): + if self.op.get_value('show_preds'): + self.softmax_name = self.op.get_value('show_preds') + + def init_model_lib(self): + if self.need_gpu: + ConvNet.init_model_lib(self) + + def plot_cost(self): + if self.show_cost not in self.train_outputs[0][0]: + raise ShowNetError("Cost function with name '%s' not defined by given convnet." % self.show_cost) +# print self.test_outputs + train_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.train_outputs] + test_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.test_outputs] + if self.smooth_test_errors: + test_errors = [sum(test_errors[max(0,i-len(self.test_batch_range)):i])/(i-max(0,i-len(self.test_batch_range))) for i in xrange(1,len(test_errors)+1)] + numbatches = len(self.train_batch_range) + test_errors = n.row_stack(test_errors) + test_errors = n.tile(test_errors, (1, self.testing_freq)) + test_errors = list(test_errors.flatten()) + test_errors += [test_errors[-1]] * max(0,len(train_errors) - len(test_errors)) + test_errors = test_errors[:len(train_errors)] + + numepochs = len(train_errors) / float(numbatches) + pl.figure(1) + x = range(0, len(train_errors)) + pl.plot(x, train_errors, 'k-', label='Training set') + pl.plot(x, test_errors, 'r-', label='Test set') + pl.legend() + ticklocs = range(numbatches, len(train_errors) - len(train_errors) % numbatches + 1, numbatches) + epoch_label_gran = int(ceil(numepochs / 20.)) + epoch_label_gran = int(ceil(float(epoch_label_gran) / 10) * 10) if numepochs >= 10 else epoch_label_gran + ticklabels = map(lambda x: str((x[1] / numbatches)) if x[0] % epoch_label_gran == epoch_label_gran-1 else '', enumerate(ticklocs)) + + pl.xticks(ticklocs, ticklabels) + pl.xlabel('Epoch') +# pl.ylabel(self.show_cost) + pl.title('%s[%d]' % (self.show_cost, self.cost_idx)) +# print "plotted cost" + + def make_filter_fig(self, filters, filter_start, fignum, _title, num_filters, combine_chans, FILTERS_PER_ROW=16): + MAX_ROWS = 24 + MAX_FILTERS = FILTERS_PER_ROW * MAX_ROWS + num_colors = filters.shape[0] + f_per_row = int(ceil(FILTERS_PER_ROW / float(1 if combine_chans else num_colors))) + filter_end = min(filter_start+MAX_FILTERS, num_filters) + filter_rows = int(ceil(float(filter_end - filter_start) / f_per_row)) + + filter_pixels = filters.shape[1] + filter_size = int(sqrt(filters.shape[1])) + fig = pl.figure(fignum) + fig.text(.5, .95, '%s %dx%d filters %d-%d' % (_title, filter_size, filter_size, filter_start, filter_end-1), horizontalalignment='center') + num_filters = filter_end 
- filter_start + if not combine_chans: + bigpic = n.zeros((filter_size * filter_rows + filter_rows + 1, filter_size*num_colors * f_per_row + f_per_row + 1), dtype=n.single) + else: + bigpic = n.zeros((3, filter_size * filter_rows + filter_rows + 1, filter_size * f_per_row + f_per_row + 1), dtype=n.single) + + for m in xrange(filter_start,filter_end ): + filter = filters[:,:,m] + y, x = (m - filter_start) / f_per_row, (m - filter_start) % f_per_row + if not combine_chans: + for c in xrange(num_colors): + filter_pic = filter[c,:].reshape((filter_size,filter_size)) + bigpic[1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, + 1 + (1 + filter_size*num_colors) * x + filter_size*c:1 + (1 + filter_size*num_colors) * x + filter_size*(c+1)] = filter_pic + else: + filter_pic = filter.reshape((3, filter_size,filter_size)) + bigpic[:, + 1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, + 1 + (1 + filter_size) * x:1 + (1 + filter_size) * x + filter_size] = filter_pic + + pl.xticks([]) + pl.yticks([]) + if not combine_chans: + pl.imshow(bigpic, cmap=pl.cm.gray, interpolation='nearest') + else: + bigpic = bigpic.swapaxes(0,2).swapaxes(0,1) + pl.imshow(bigpic, interpolation='nearest') + + def plot_filters(self): + FILTERS_PER_ROW = 16 + filter_start = 0 # First filter to show + if self.show_filters not in self.layers: + raise ShowNetError("Layer with name '%s' not defined by given convnet." % self.show_filters) + layer = self.layers[self.show_filters] + filters = layer['weights'][self.input_idx] +# filters = filters - filters.min() +# filters = filters / filters.max() + if layer['type'] == 'fc': # Fully-connected layer + num_filters = layer['outputs'] + channels = self.channels + filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) + elif layer['type'] in ('conv', 'local'): # Conv layer + num_filters = layer['filters'] + channels = layer['filterChannels'][self.input_idx] + if layer['type'] == 'local': + filters = filters.reshape((layer['modules'], channels, layer['filterPixels'][self.input_idx], num_filters)) + filters = filters[:, :, :, self.local_plane] # first map for now (modules, channels, pixels) + filters = filters.swapaxes(0,2).swapaxes(0,1) + num_filters = layer['modules'] +# filters = filters.swapaxes(0,1).reshape(channels * layer['filterPixels'][self.input_idx], num_filters * layer['modules']) +# num_filters *= layer['modules'] + FILTERS_PER_ROW = layer['modulesX'] + else: + filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) + + + # Convert YUV filters to RGB + if self.yuv_to_rgb and channels == 3: + R = filters[0,:,:] + 1.28033 * filters[2,:,:] + G = filters[0,:,:] + -0.21482 * filters[1,:,:] + -0.38059 * filters[2,:,:] + B = filters[0,:,:] + 2.12798 * filters[1,:,:] + filters[0,:,:], filters[1,:,:], filters[2,:,:] = R, G, B + combine_chans = not self.no_rgb and channels == 3 + + # Make sure you don't modify the backing array itself here -- so no -= or /= + if self.norm_filters: + #print filters.shape + filters = filters - n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).mean(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1)) + filters = filters / n.sqrt(n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).var(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1))) + #filters = filters - n.tile(filters.min(axis=0).min(axis=0), (3, filters.shape[1], 1)) + #filters = filters / 
n.tile(filters.max(axis=0).max(axis=0), (3, filters.shape[1], 1)) + #else: + filters = filters - filters.min() + filters = filters / filters.max() + + self.make_filter_fig(filters, filter_start, 2, 'Layer %s' % self.show_filters, num_filters, combine_chans, FILTERS_PER_ROW=FILTERS_PER_ROW) + + def plot_predictions(self): + epoch, batch, data = self.get_next_batch(train=False) # get a test batch + num_classes = self.test_data_provider.get_num_classes() + NUM_ROWS = 2 + NUM_COLS = 4 + NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1] + NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels + NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs'] + PRED_IDX = 1 + + label_names = [lab.split(',')[0] for lab in self.test_data_provider.batch_meta['label_names']] + if self.only_errors: + preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single) + else: + preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single) + #rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS] + rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS) + if NUM_IMGS < data[0].shape[1]: + data = [n.require(d[:,rand_idx], requirements='C') for d in data] +# data += [preds] + # Run the model + print [d.shape for d in data], preds.shape + self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name]) + IGPUModel.finish_batch(self) + print preds + data[0] = self.test_data_provider.get_plottable_data(data[0]) + + if self.save_preds: + if not gfile.Exists(self.save_preds): + gfile.MakeDirs(self.save_preds) + preds_thresh = preds > 0.5 # Binarize predictions + data[0] = data[0] * 255.0 + data[0][data[0]<0] = 0 + data[0][data[0]>255] = 255 + data[0] = n.require(data[0], dtype=n.uint8) + dir_name = '%s_predictions_batch_%d' % (os.path.basename(self.save_file), batch) + tar_name = os.path.join(self.save_preds, '%s.tar' % dir_name) + tfo = gfile.GFile(tar_name, "w") + tf = TarFile(fileobj=tfo, mode='w') + for img_idx in xrange(NUM_IMGS): + img = data[0][img_idx,:,:,:] + imsave = Image.fromarray(img) + prefix = "CORRECT" if data[1][0,img_idx] == preds_thresh[img_idx,PRED_IDX] else "FALSE_POS" if preds_thresh[img_idx,PRED_IDX] == 1 else "FALSE_NEG" + file_name = "%s_%.2f_%d_%05d_%d.png" % (prefix, preds[img_idx,PRED_IDX], batch, img_idx, data[1][0,img_idx]) +# gf = gfile.GFile(file_name, "w") + file_string = StringIO() + imsave.save(file_string, "PNG") + tarinf = TarInfo(os.path.join(dir_name, file_name)) + tarinf.size = file_string.tell() + file_string.seek(0) + tf.addfile(tarinf, file_string) + tf.close() + tfo.close() +# gf.close() + print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name) + else: + fig = pl.figure(3, figsize=(12,9)) + fig.text(.4, .95, '%s test samples' % ('Mistaken' if self.only_errors else 'Random')) + if self.only_errors: + # what the net got wrong + if NUM_OUTPUTS > 1: + err_idx = [i for i,p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:,i] > 0)[0]] + else: + err_idx = n.where(data[1][0,:] != preds[:,0].T)[0] + print err_idx + err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS)) + data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:] + + + import matplotlib.gridspec as gridspec + import matplotlib.colors as colors + cconv = colors.ColorConverter() + gs = gridspec.GridSpec(NUM_ROWS*2, NUM_COLS, + width_ratios=[1]*NUM_COLS, height_ratios=[2,1]*NUM_ROWS ) + #print data[1] + for row in 
xrange(NUM_ROWS): + for col in xrange(NUM_COLS): + img_idx = row * NUM_COLS + col + if data[0].shape[0] <= img_idx: + break + pl.subplot(gs[(row * 2) * NUM_COLS + col]) + #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1) + pl.xticks([]) + pl.yticks([]) + img = data[0][img_idx,:,:,:] + pl.imshow(img, interpolation='lanczos') + show_title = data[1].shape[0] == 1 + true_label = [int(data[1][0,img_idx])] if show_title else n.where(data[1][:,img_idx]==1)[0] + #print true_label + #print preds[img_idx,:].shape + #print preds[img_idx,:].max() + true_label_names = [label_names[i] for i in true_label] + img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:] + #print img_labels + axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col]) + height = 0.5 + ylocs = n.array(range(NUM_TOP_CLASSES))*height + pl.barh(ylocs, [l[0] for l in img_labels], height=height, \ + color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels]) + #pl.title(", ".join(true_labels)) + if show_title: + pl.title(", ".join(true_label_names), fontsize=15, fontweight='bold') + else: + print true_label_names + pl.yticks(ylocs + height/2, [l[1] for l in img_labels], x=1, backgroundcolor=cconv.to_rgba('0.65', alpha=0.5), weight='bold') + for line in enumerate(axes.get_yticklines()): + line[1].set_visible(False) + #pl.xticks([width], ['']) + #pl.yticks([]) + pl.xticks([]) + pl.ylim(0, ylocs[-1] + height) + pl.xlim(0, 1) + + def start(self): + self.op.print_values() +# print self.show_cost + if self.show_cost: + self.plot_cost() + if self.show_filters: + self.plot_filters() + if self.show_preds: + self.plot_predictions() + + if pl: + pl.show() + sys.exit(0) + + @classmethod + def get_options_parser(cls): + op = ConvNet.get_options_parser() + for option in list(op.options): + if option not in ('gpu', 'load_file', 'inner_size', 'train_batch_range', 'test_batch_range', 'multiview_test', 'data_path', 'pca_noise', 'scalar_mean'): + op.delete_option(option) + op.add_option("show-cost", "show_cost", StringOptionParser, "Show specified objective function", default="") + op.add_option("show-filters", "show_filters", StringOptionParser, "Show learned filters in specified layer", default="") + op.add_option("norm-filters", "norm_filters", BooleanOptionParser, "Individually normalize filters shown with --show-filters", default=0) + op.add_option("input-idx", "input_idx", IntegerOptionParser, "Input index for layer given to --show-filters", default=0) + op.add_option("cost-idx", "cost_idx", IntegerOptionParser, "Cost function return value index for --show-cost", default=0) + op.add_option("no-rgb", "no_rgb", BooleanOptionParser, "Don't combine filter channels into RGB in layer given to --show-filters", default=False) + op.add_option("yuv-to-rgb", "yuv_to_rgb", BooleanOptionParser, "Convert RGB filters to YUV in layer given to --show-filters", default=False) + op.add_option("channels", "channels", IntegerOptionParser, "Number of channels in layer given to --show-filters (fully-connected layers only)", default=0) + op.add_option("show-preds", "show_preds", StringOptionParser, "Show predictions made by given softmax on test set", default="") + op.add_option("save-preds", "save_preds", StringOptionParser, "Save predictions to given path instead of showing them", default="") + op.add_option("only-errors", "only_errors", BooleanOptionParser, "Show only mistaken predictions (to be used with --show-preds)", default=False, requires=['show_preds']) + op.add_option("local-plane", 
"local_plane", IntegerOptionParser, "Local plane to show", default=0) + op.add_option("smooth-test-errors", "smooth_test_errors", BooleanOptionParser, "Use running average for test error plot?", default=1) + + op.options['load_file'].default = None + return op + +if __name__ == "__main__": + #nr.seed(6) + try: + op = ShowConvNet.get_options_parser() + op, load_dic = IGPUModel.parse_options(op) + model = ShowConvNet(op, load_dic) + model.start() + except (UnpickleError, ShowNetError, opt.GetoptError), e: + print "----------------" + print "Error:" + print e diff --git a/caffe2/contrib/cuda-convnet2/util/Makefile b/caffe2/contrib/cuda-convnet2/util/Makefile new file mode 100644 index 0000000..55aba16 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/Makefile @@ -0,0 +1,57 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LINK_LIBS := -L$(ATLAS_LIB_PATH) -latlas -lcblas +INCLUDES := -I./include +COMMONFLAGS := +CC_ARGS := +CC=g++ + +ifndef debug + CC_ARGS += -O3 +endif + +OUT_DIR=./bin/$(OUT_SUFFIX) +OUT_FILE=libutil.so + +ifeq ($(numpy), 1) + PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) + LINK_LIBS += -lpython$(PYTHON_VERSION) + + INCLUDES += -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH) + COMMONFLAGS += -DNUMPY_INTERFACE + OUT_FILE=libutilpy.so +endif + +OBJECTS = matrix.cpp + +all: dir classes $(OUT_FILE) + +dir: + mkdir -p $(OUT_DIR)/src + +SOURCES = $(shell echo src/*.cpp) +CLASSES = $(SOURCES:.cpp=.o) + +classes: $(CLASSES) + +%.o: %.cpp + $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o + +$(OUT_FILE): classes + cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS) + ln -sf $(OUT_DIR)/$(OUT_FILE) . + +clean: + rm -rf $(OUT_DIR)/* diff --git a/caffe2/contrib/cuda-convnet2/util/include/matrix.h b/caffe2/contrib/cuda-convnet2/util/include/matrix.h new file mode 100644 index 0000000..c75da8c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/matrix.h @@ -0,0 +1,263 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include "matrix_funcs.h"
+#ifdef NUMPY_INTERFACE
+#include <Python.h>
+#include <arrayobject.h>
+#endif
+#include <limits>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+
+extern "C" {
+// #include <cblas.h>
+#include "caffe2/utils/cblas.h"
+}
+
+#ifdef DOUBLE_PRECISION
+#define CBLAS_GEMM cblas_dgemm
+#define CBLAS_SCAL cblas_dscal
+#define CBLAS_AXPY cblas_daxpy
+#else
+#define CBLAS_GEMM cblas_sgemm
+#define CBLAS_SCAL cblas_sscal
+#define CBLAS_AXPY cblas_saxpy
+#endif /* DOUBLE_PRECISION */
+
+#define MTYPE_MAX numeric_limits<MTYPE>::max()
+
+typedef long long int int64;
+
+class Matrix {
+private:
+    MTYPE* _data;
+    bool _ownsData;
+    int64 _numRows, _numCols;
+    int64 _numElements;
+    CBLAS_TRANSPOSE _trans;
+
+    void _init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData);
+    void _tileTo2(Matrix& target) const;
+    void _copyAllTo(Matrix& target) const;
+    MTYPE _sum_column(int64 col) const;
+    MTYPE _sum_row(int64 row) const;
+    MTYPE _aggregate(MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    void _aggregate(int64 axis, Matrix& target, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    MTYPE _aggregateRow(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    MTYPE _aggregateCol(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    void _updateDims(int64 numRows, int64 numCols);
+    void _applyLoop(MTYPE(*func)(MTYPE));
+    void _applyLoop(MTYPE (*func)(MTYPE), Matrix& target);
+    void _applyLoop2(const Matrix& a, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
+    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const;
+    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const;
+    void _applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
+    void _checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
+    void _divideByVector(const Matrix& vec, Matrix& target);
+    inline int64 _getNumColsBackEnd() const {
+        return _trans == CblasNoTrans ? _numCols : _numRows;
+    }
+public:
+    enum FUNCTION {
+        TANH, RECIPROCAL, SQUARE, ABS, EXP, LOG, ZERO, ONE, LOGISTIC1, LOGISTIC2, SIGN
+    };
+    Matrix();
+    Matrix(int64 numRows, int64 numCols);
+    Matrix(int64 numRows, int64 numCols, bool transpose);
+#ifdef NUMPY_INTERFACE
+    Matrix(const PyArrayObject *src);
+#endif
+    Matrix(const Matrix &like);
+    Matrix(MTYPE* data, int64 numRows, int64 numCols);
+    Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose);
+    ~Matrix();
+
+    inline MTYPE& getCell(int64 i, int64 j) const {
+        assert(i >= 0 && i < _numRows);
+        assert(j >= 0 && j < _numCols);
+        if (_trans == CblasTrans) {
+            return _data[j * _numRows + i];
+        }
+        return _data[i * _numCols + j];
+    }
+
+    MTYPE& operator()(int64 i, int64 j) const {
+        return getCell(i, j);
+    }
+
+    inline MTYPE* getData() const {
+        return _data;
+    }
+
+    inline bool isView() const {
+        return !_ownsData;
+    }
+
+    inline int64 getNumRows() const {
+        return _numRows;
+    }
+
+    inline int64 getNumCols() const {
+        return _numCols;
+    }
+
+    inline int64 getNumDataBytes() const {
+        return _numElements * sizeof(MTYPE);
+    }
+
+    inline int64 getNumElements() const {
+        return _numElements;
+    }
+
+    inline int64 getLeadingDim() const {
+        return _trans == CblasTrans ? _numRows : _numCols;
+    }
+
+    inline int64 getFollowingDim() const {
+        return _trans == CblasTrans ?
_numCols : _numRows; + } + + inline CBLAS_TRANSPOSE getBLASTrans() const { + return _trans; + } + + inline bool isSameDims(const Matrix& a) const { + return a.getNumRows() == getNumRows() && a.getNumCols() == getNumCols(); + } + + inline bool isTrans() const { + return _trans == CblasTrans; + } + + /* + * Only use if you know what you're doing! + * Does not update any dimensions. Just flips the _trans flag. + * + * Use transpose() if you want to get the transpose of this matrix. + */ + inline void setTrans(bool trans) { + assert(isTrans() == trans || !isView()); + _trans = trans ? CblasTrans : CblasNoTrans; + } + + void apply(FUNCTION f); + void apply(Matrix::FUNCTION f, Matrix& target); + void subtractFromScalar(MTYPE scalar); + void subtractFromScalar(MTYPE scalar, Matrix &target) const; + void biggerThanScalar(MTYPE scalar); + void smallerThanScalar(MTYPE scalar); + void equalsScalar(MTYPE scalar); + void biggerThanScalar(MTYPE scalar, Matrix& target) const; + void smallerThanScalar(MTYPE scalar, Matrix& target) const; + void equalsScalar(MTYPE scalar, Matrix& target) const; + void biggerThan(Matrix& a); + void biggerThan(Matrix& a, Matrix& target) const; + void smallerThan(Matrix& a); + void smallerThan(Matrix& a, Matrix& target) const; + void minWith(Matrix &a); + void minWith(Matrix &a, Matrix &target) const; + void maxWith(Matrix &a); + void maxWith(Matrix &a, Matrix &target) const; + void equals(Matrix& a); + void equals(Matrix& a, Matrix& target) const; + void notEquals(Matrix& a) ; + void notEquals(Matrix& a, Matrix& target) const; + void add(const Matrix &m); + void add(const Matrix &m, MTYPE scale); + void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM); + void add(const Matrix &m, Matrix& target); + void add(const Matrix &m, MTYPE scaleM, Matrix &target); + void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target); + void subtract(const Matrix &m); + void subtract(const Matrix &m, Matrix& target); + void subtract(const Matrix &m, MTYPE scale); + void subtract(const Matrix &m, MTYPE scale, Matrix& target); + void addVector(const Matrix& vec, MTYPE scale); + void addVector(const Matrix& vec, MTYPE scale, Matrix& target); + void addVector(const Matrix& vec); + void addVector(const Matrix& vec, Matrix& target); + void addScalar(MTYPE scalar); + void addScalar(MTYPE scalar, Matrix& target) const; + void maxWithScalar(MTYPE scalar); + void maxWithScalar(MTYPE scalar, Matrix &target) const; + void minWithScalar(MTYPE scalar); + void minWithScalar(MTYPE scalar, Matrix &target) const; + void eltWiseMultByVector(const Matrix& vec); + void eltWiseMultByVector(const Matrix& vec, Matrix& target); + void eltWiseDivideByVector(const Matrix& vec); + void eltWiseDivideByVector(const Matrix& vec, Matrix& target); + void resize(int64 newNumRows, int64 newNumCols); + void resize(const Matrix& like); + Matrix& slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const; + void slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix &target) const; + Matrix& sliceRows(int64 startRow, int64 endRow) const; + void sliceRows(int64 startRow, int64 endRow, Matrix& target) const; + Matrix& sliceCols(int64 startCol, int64 endCol) const; + void sliceCols(int64 startCol, int64 endCol, Matrix& target) const; + void rightMult(const Matrix &b, MTYPE scale); + void rightMult(const Matrix &b, Matrix &target) const; + void rightMult(const Matrix &b); + void rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const; + void addProduct(const Matrix &a, const 
Matrix &b, MTYPE scaleAB, MTYPE scaleThis);
+    void addProduct(const Matrix& a, const Matrix& b);
+    void eltWiseMult(const Matrix& a);
+    void eltWiseMult(const Matrix& a, Matrix& target) const;
+    void eltWiseDivide(const Matrix& a);
+    void eltWiseDivide(const Matrix& a, Matrix &target) const;
+    Matrix& transpose() const;
+    Matrix& transpose(bool hard) const;
+    Matrix& tile(int64 timesY, int64 timesX) const;
+    void tile(int64 timesY, int64 timesX, Matrix& target) const;
+    void copy(Matrix &dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const;
+    Matrix& copy() const;
+    void copy(Matrix& target) const;
+    Matrix& sum(int64 axis) const;
+    void sum(int64 axis, Matrix &target) const;
+    MTYPE sum() const;
+    MTYPE max() const;
+    Matrix& max(int64 axis) const;
+    void max(int64 axis, Matrix& target) const;
+    MTYPE min() const;
+    Matrix& min(int64 axis) const;
+    void min(int64 axis, Matrix& target) const;
+    MTYPE norm() const;
+    MTYPE norm2() const;
+    void scale(MTYPE scale);
+    void scale(MTYPE alpha, Matrix& target);
+    void reshape(int64 numRows, int64 numCols);
+    Matrix& reshaped(int64 numRows, int64 numCols);
+    void printShape(const char* name) const;
+    bool hasNan() const;
+    bool hasInf() const;
+
+    void randomizeNormal(MTYPE mean, MTYPE stdev);
+    void randomizeUniform();
+    void randomizeNormal();
+    void print() const;
+    void print(int64 startRow,int64 rows, int64 startCol,int64 cols) const;
+    void print(int64 rows, int64 cols) const;
+};
+
+typedef std::vector<Matrix*> MatrixV;
+
+#endif /* MATRIX_H_ */
diff --git a/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h b/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h
new file mode 100644
index 0000000..2d37ff1
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MATRIX_FUNCS_H_
+#define MATRIX_FUNCS_H_
+
+#include <stdlib.h>
+#include <math.h>
+#include <algorithm>
+
+#ifdef DOUBLE_PRECISION
+#define MTYPE double
+#else
+#define MTYPE float
+#endif
+
+#define MYRAND ((double)rand() / ((double)RAND_MAX + 1))
+
+inline MTYPE _zero(MTYPE /*x*/) {
+    return 0;
+}
+
+inline MTYPE _one(MTYPE /*x*/) {
+    return 1;
+}
+
+inline MTYPE _abs(MTYPE x) {
+    return x > 0 ? x : -x;
+}
+
+inline MTYPE _square(MTYPE x) {
+    return x * x;
+}
+
+inline MTYPE _sigma1(MTYPE x) {
+    return (tanh(x / 2) + 1) / 2;
+}
+
+inline MTYPE _sigma2(MTYPE x) {
+    return 1 / (1 + exp(-x));
+}
+
+inline MTYPE _recip(MTYPE x) {
+    return 1 / x;
+}
+
+inline MTYPE _exp(MTYPE x) {
+    return exp(x);
+}
+
+inline MTYPE _log(MTYPE x) {
+    return log(x);
+}
+
+inline MTYPE _tanh(MTYPE x) {
+    return tanh(x);
+}
+
+inline MTYPE _sign(MTYPE x) {
+    return x > 0 ?
1 : -1; +} + +inline MTYPE _rand(MTYPE /*x*/) { + return MYRAND; +} + +inline MTYPE _divide(MTYPE x, MTYPE y) { + return x / y; +} + +inline MTYPE _mult(MTYPE x, MTYPE y) { + return x * y; +} + +inline MTYPE _add(MTYPE x, MTYPE y) { + return x + y; +} + +inline MTYPE _addSquare(MTYPE x, MTYPE y) { + return x*x + y; +} + +inline MTYPE _addWithScale(MTYPE x, MTYPE y, MTYPE scale) { + return x + scale*y; +} + +inline MTYPE _addWithScale2(MTYPE x, MTYPE y, MTYPE scaleThis, MTYPE scaleM) { + return scaleThis * x + scaleM * y; +} + +inline MTYPE _max(MTYPE x, MTYPE y) { + return std::max(x, y); +} + +inline MTYPE _min(MTYPE x, MTYPE y) { + return std::min(x, y); +} + +inline MTYPE _bigger(MTYPE x, MTYPE y) { + return x > y; +} + +inline MTYPE _smaller(MTYPE x, MTYPE y) { + return x < y; +} + +inline MTYPE _equal(MTYPE x, MTYPE y) { + return x == y; +} + +inline MTYPE _notEqual(MTYPE x, MTYPE y) { + return x != y; +} + +#endif /* MATRIX_FUNCS_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/queue.h b/caffe2/contrib/cuda-convnet2/util/include/queue.h new file mode 100644 index 0000000..e5cddd4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/queue.h @@ -0,0 +1,112 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef QUEUE_H_ +#define QUEUE_H_ +#include +#include + +/* + * A thread-safe circular queue that automatically grows but never shrinks. + */ +template +class Queue { +private: + T *_elements; + int _numElements; + int _head, _tail; + int _maxSize; + pthread_mutex_t *_queueMutex; + pthread_cond_t *_queueCV; + + void _init(int initialSize) { + _numElements = 0; + _head = 0; + _tail = 0; + _maxSize = initialSize; + _elements = new T[initialSize]; + _queueCV = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t))); + _queueMutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + pthread_mutex_init(_queueMutex, NULL); + pthread_cond_init(_queueCV, NULL); + } + + void expand() { + T *newStorage = new T[_maxSize * 2]; + memcpy(newStorage, _elements + _head, (_maxSize - _head) * sizeof(T)); + memcpy(newStorage + _maxSize - _head, _elements, _tail * sizeof(T)); + delete[] _elements; + _elements = newStorage; + _head = 0; + _tail = _numElements; + _maxSize *= 2; + } +public: + Queue(int initialSize) { + _init(initialSize); + } + + Queue() { + _init(1); + } + + ~Queue() { + pthread_mutex_destroy(_queueMutex); + pthread_cond_destroy(_queueCV); + delete[] _elements; + free(_queueMutex); + free(_queueCV); + } + + void enqueue(T el) { + pthread_mutex_lock(_queueMutex); + if (_numElements == _maxSize) { + expand(); + } + _elements[_tail] = el; + _tail = (_tail + 1) % _maxSize; + _numElements++; + + pthread_cond_signal(_queueCV); + pthread_mutex_unlock(_queueMutex); + } + + /* + * Blocks until not empty. + */ + T dequeue() { + pthread_mutex_lock(_queueMutex); + // Apparently, pthread_cond_signal may actually unblock + // multiple threads, so a while loop is needed here. 
+ while (_numElements == 0) { + pthread_cond_wait(_queueCV, _queueMutex); + } + T el = _elements[_head]; + _head = (_head + 1) % _maxSize; + _numElements--; + pthread_mutex_unlock(_queueMutex); + return el; + } + + /* + * Obviously this number can change by the time you actually look at it. + */ + inline int getNumElements() const { + return _numElements; + } +}; + +#endif /* QUEUE_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/sync.h b/caffe2/contrib/cuda-convnet2/util/include/sync.h new file mode 100644 index 0000000..00113a5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/sync.h @@ -0,0 +1,79 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SYNC_H_ +#define SYNC_H_ + +#include + +class Lock { +private: + pthread_mutex_t _mutex; +public: + Lock() { + pthread_mutex_init(&_mutex, NULL); + } + ~Lock() { + pthread_mutex_destroy(&_mutex); + } + + void acquire() { + pthread_mutex_lock(&_mutex); + } + + void release() { + pthread_mutex_unlock(&_mutex); + } +}; + +class ThreadSynchronizer { +private: + int _numThreads; + int _numSynced; + pthread_mutex_t *_syncMutex; + pthread_cond_t *_syncThresholdCV; +public: + ThreadSynchronizer(int numThreads) { + _numThreads = numThreads; + _numSynced = 0; + _syncMutex = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t)); + _syncThresholdCV = (pthread_cond_t*) malloc(sizeof(pthread_cond_t)); + pthread_mutex_init(_syncMutex, NULL); + pthread_cond_init(_syncThresholdCV, NULL); + } + + ~ThreadSynchronizer() { + pthread_mutex_destroy(_syncMutex); + pthread_cond_destroy(_syncThresholdCV); + free(_syncMutex); + free(_syncThresholdCV); + } + + void sync() { + pthread_mutex_lock(_syncMutex); + _numSynced++; + + if (_numSynced == _numThreads) { + _numSynced = 0; + pthread_cond_broadcast(_syncThresholdCV); + } else { + pthread_cond_wait(_syncThresholdCV, _syncMutex); + } + pthread_mutex_unlock(_syncMutex); + } +}; + +#endif /* SYNC_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/thread.h b/caffe2/contrib/cuda-convnet2/util/include/thread.h new file mode 100644 index 0000000..8380b58 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/thread.h @@ -0,0 +1,111 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef THREAD_H_ +#define THREAD_H_ +#include +#include +#include +#include +#include + +#define NUM_CPUS_MAX 48 + +/* + * Abstract joinable thread class. 
+ * The only thing the implementer has to fill in is the run method. + */ +class Thread { +private: + cpu_set_t *_cpu_set; + pthread_attr_t _pthread_attr; + pthread_t _threadID; + bool _joinable, _startable; + + static void* start_pthread_func(void *obj) { + void* retval = reinterpret_cast(obj)->run(); + pthread_exit(retval); + return retval; + } +protected: + virtual void* run() = 0; +public: + Thread(bool joinable) : _cpu_set(NULL), _joinable(joinable), _startable(true) { + pthread_attr_init(&_pthread_attr); + } + + Thread(bool joinable, std::vector& cpus) : _cpu_set(NULL), _joinable(joinable), _startable(true) { + pthread_attr_init(&_pthread_attr); + setAffinity(cpus); + } + + virtual ~Thread() { + if (_cpu_set != NULL) { + CPU_FREE(_cpu_set); + } + pthread_attr_destroy(&_pthread_attr); + } + + void setAffinity(std::vector& cpus) { + assert(_startable); + _cpu_set = CPU_ALLOC(NUM_CPUS_MAX); + size_t size = CPU_ALLOC_SIZE(NUM_CPUS_MAX); + if (cpus.size() > 0 && cpus[0] >= 0) { + CPU_ZERO_S(size, _cpu_set); + for (int i = 0; i < cpus.size(); i++) { + assert(cpus[i] < NUM_CPUS_MAX); + CPU_SET_S(cpus[i], size, _cpu_set); +// printf("set cpu %d\n", cpus[i]); + } + pthread_attr_setaffinity_np(&_pthread_attr, size, _cpu_set); + } + } + + pthread_t start() { + assert(_startable); + _startable = false; + pthread_attr_setdetachstate(&_pthread_attr, _joinable ? PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED); + int n; + if ((n = pthread_create(&_threadID, &_pthread_attr, &Thread::start_pthread_func, (void*)this))) { + errno = n; + perror("pthread_create error"); + } + return _threadID; + } + + void join(void **status) { + assert(_joinable); + int n; + if((n = pthread_join(_threadID, status))) { + errno = n; + perror("pthread_join error"); + } + } + + void join() { + join(NULL); + } + + pthread_t getThreadID() const { + return _threadID; + } + + bool isStartable() const { + return _startable; + } +}; + +#endif /* THREAD_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp b/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp new file mode 100644 index 0000000..a1da84c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp @@ -0,0 +1,820 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/matrix.h" +#include "../include/matrix_funcs.h" + +#if defined(_WIN64) || defined(_WIN32) +double sqrt(int _X) {return sqrt((double) _X);} +double log(int _X) {return log((double) _X);} +#endif + +using namespace std; + +void Matrix::_init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData) { + _updateDims(numRows, numCols); + _ownsData = ownsData; + _trans = transpose ? CblasTrans : CblasNoTrans; + _data = data; +} + +Matrix::Matrix() { + _init(NULL, 0, 0, false, true); +} + +Matrix::Matrix(int64 numRows, int64 numCols) { + _init(NULL, numRows, numCols, false, true); + this->_data = numRows * numCols > 0 ? 
new MTYPE[this->_numElements] : NULL; +} + +Matrix::Matrix(int64 numRows, int64 numCols, bool transpose) { + _init(NULL, numRows, numCols, transpose, true); + this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL; +} + +Matrix::Matrix(const Matrix &like) { + _init(NULL, like.getNumRows(), like.getNumCols(), false, true); + this->_data = new MTYPE[this->_numElements]; +} + +/* construct a matrix with another matrix's data. the resultant + * matrix does NOT own its data */ +Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols) { + _init(data, numRows, numCols, false, false); +} + +/* construct a matrix with another matrix's data (and optionally transpose it). the resultant + * matrix does NOT own its data -- it is a VIEW */ +Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose) { + _init(data, numRows, numCols, transpose, false); +} + +#ifdef NUMPY_INTERFACE +Matrix::Matrix(const PyArrayObject *src) { + this->_data = NULL; + this->_trans = CblasNoTrans; + if (src != NULL) { + this->_updateDims(PyArray_DIM(src,0), PyArray_DIM(src,1)); + if (src->flags & NPY_CONTIGUOUS || src->flags & NPY_FORTRAN) { + this->_data = (MTYPE*) src->data; + this->_ownsData = false; + this->_trans = src->flags & NPY_CONTIGUOUS ? CblasNoTrans : CblasTrans; + } else { + this->_data = new MTYPE[PyArray_DIM(src,0) * PyArray_DIM(src,1)]; + for (int64 i = 0; i < PyArray_DIM(src,0); i++) { + for (int64 j = 0; j < PyArray_DIM(src,1); j++) { + (*this)(i,j) = *reinterpret_cast(PyArray_GETPTR2(src,i,j)); + } + } + this->_ownsData = true; + } + } +} +#endif +Matrix::~Matrix() { + if(this->_data != NULL && this->_ownsData) { + delete[] this->_data; + } +} + +void Matrix::_updateDims(int64 numRows, int64 numCols) { + this->_numRows = numRows; + this->_numCols = numCols; + this->_numElements = numRows * numCols; +} + +void Matrix::_checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const { + assert(startRow >= 0 && startRow <= _numRows); + assert(endRow >= 0 && endRow <= _numRows); + assert(startCol >= 0 && startCol <= _numCols); + assert(endCol >= 0 && endCol <= _numCols); +} + +/* will return a view if possible */ +Matrix& Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + if (!isTrans() && ((startCol == 0 && endCol == this->_numCols) || (startRow == endRow - 1))) { + return *new Matrix(this->_data + startRow * this->_numCols + startCol, endRow - startRow, endCol - startCol); + } else if (isTrans() && ((startRow == 0 && endRow == this->_numRows) || (startCol == endCol - 1))) { + return *new Matrix(this->_data + startCol * this->_numRows + startRow, endRow - startRow, endCol - startCol, true); + } + Matrix& newSlice = *new Matrix(endRow - startRow, endCol - startCol); + this->copy(newSlice, startRow, endRow, startCol, endCol, 0, 0); + return newSlice; +} + +/* this will NEVER return a view, unlike Matrix_slice */ +void Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix& target) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? 
this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + target.resize(endRow - startRow, endCol - startCol); + this->copy(target, startRow, endRow, startCol, endCol, 0, 0); +} + +Matrix& Matrix::sliceRows(int64 startRow, int64 endRow) const { + return slice(startRow, endRow, 0, -1); +} + +void Matrix::sliceRows(int64 startRow, int64 endRow, Matrix& target) const { + slice(startRow, endRow, 0, -1, target); +} + +Matrix& Matrix::sliceCols(int64 startCol, int64 endCol) const { + return slice(0, -1, startCol, endCol); +} + +void Matrix::sliceCols(int64 startCol, int64 endCol, Matrix& target) const { + slice(0, -1, startCol, endCol, target); +} + +void Matrix::subtractFromScalar(MTYPE scalar) { + subtractFromScalar(scalar, *this); +} + +void Matrix::subtractFromScalar(MTYPE scalar, Matrix& target) const { + if(&target != this) { + copy(target); + } + target.scale(-1); + target.addScalar(scalar); +} + +void Matrix::biggerThanScalar(MTYPE scalar) { + biggerThanScalar(scalar, *this); +} + +void Matrix::smallerThanScalar(MTYPE scalar) { + smallerThanScalar(scalar, *this); +} + +void Matrix::equalsScalar(MTYPE scalar) { + equalsScalar(scalar, *this); +} + +void Matrix::biggerThanScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_bigger, target); +} + +void Matrix::smallerThanScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_smaller, target); +} + +void Matrix::equalsScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_equal, target); +} + +void Matrix::add(const Matrix &m) { + add(m, 1, *this); +} + +void Matrix::add(const Matrix &m, Matrix& target) { + add(m, 1, target); +} + +void Matrix::add(const Matrix &m, MTYPE scale) { + add(m, scale, *this); +} + +void Matrix::subtract(const Matrix &m) { + add(m, -1, *this); +} + +void Matrix::subtract(const Matrix &m, Matrix& target) { + add(m, -1, target); +} + +void Matrix::subtract(const Matrix &m, MTYPE scale) { + add(m, -scale, *this); +} + +void Matrix::subtract(const Matrix &m, MTYPE scale, Matrix& target) { + add(m, -scale, target); +} + +void Matrix::add(const Matrix &m, MTYPE scaleM, Matrix &target) { + add(m, 1, scaleM, target); +} + +void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM) { + add(m, scaleThis, scaleM, *this); +} + +void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target) { + assert(this->isSameDims(m)); + if (isTrans() != m.isTrans() || isTrans() != target.isTrans() || scaleThis != 1) { + if (&target != this) { + target.resize(*this); + } + if(scaleThis == 1 && scaleM == 1) { + this->_applyLoop2(m, &_add, target); + } else if (scaleThis == 1) { + this->_applyLoop2(m, &_addWithScale, scaleM, target); + } else { + this->_applyLoop2(m, &_addWithScale2, scaleThis, scaleM, target); + } + } else { + if (&target != this) { + copy(target); + } + CBLAS_AXPY(getNumElements(), scaleM, m._data, 1, target._data, 1); + } +} + +void Matrix::addScalar(MTYPE scalar) { + addScalar(scalar, *this); +} + +void Matrix::addScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_add, target); +} + +void Matrix::maxWithScalar(MTYPE scalar) { + maxWithScalar(scalar, *this); +} + +void Matrix::maxWithScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_max, target); +} + +void Matrix::minWithScalar(MTYPE scalar) { + minWithScalar(scalar, *this); +} + +void 
Matrix::minWithScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_min, target); +} + +void Matrix::biggerThan(Matrix& a) { + biggerThan(a, *this); +} + +void Matrix::biggerThan(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_bigger, target); +} + +void Matrix::smallerThan(Matrix& a) { + smallerThan(a, *this); +} + +void Matrix::smallerThan(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_smaller, target); +} + +void Matrix::equals(Matrix& a) { + equals(a, *this); +} + +void Matrix::equals(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_equal, target); +} + +void Matrix::notEquals(Matrix& a) { + notEquals(a, *this); +} + +void Matrix::notEquals(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_notEqual, target); +} + +void Matrix::minWith(Matrix &a) { + minWith(a, *this); +} + +void Matrix::minWith(Matrix &a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_min, target); +} + +void Matrix::maxWith(Matrix &a) { + maxWith(a, *this); +} + +void Matrix::maxWith(Matrix &a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_max, target); +} + +/* this := this + scale*tile(vec) */ +void Matrix::addVector(const Matrix& vec, MTYPE scale, Matrix& target) { + if(&target != this) { + copy(target); + } + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + const bool colVector = vec.getNumCols() == 1; + assert((rowVector && vec.getNumCols() == target.getNumCols()) || (colVector && vec.getNumRows() == target.getNumRows())); + if (rowVector && colVector) { + addScalar(vec(0,0) * scale, target); + return; + } + const int64 loopTil = rowVector ? target.getNumRows() : target.getNumCols(); + const int64 dataInc = ((rowVector && target.isTrans()) || (!rowVector && !target.isTrans())) ? 1 : (rowVector ? target.getNumCols() : target.getNumRows()); + const int64 myStride = ((target.isTrans() && rowVector) || (!target.isTrans() && !rowVector)) ? loopTil : 1; + for (int64 i = 0; i < loopTil; i++) { + CBLAS_AXPY(vec.getNumElements(), scale, vec._data, 1, target._data + dataInc * i, myStride); + } +} + +/* this := this + scale*tile(vec) */ +void Matrix::addVector(const Matrix& vec, MTYPE scale) { + addVector(vec, scale, *this); +} + +void Matrix::addVector(const Matrix& vec) { + addVector(vec, 1, *this); +} + +void Matrix::addVector(const Matrix& vec, Matrix& target) { + addVector(vec, 1, target); +} + +void Matrix::eltWiseMultByVector(const Matrix& vec) { + eltWiseMultByVector(vec, *this); +} + +/* omg test these */ +void Matrix::eltWiseMultByVector(const Matrix& vec, Matrix& target) { + if(&target != this) { + copy(target); + } + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + assert((rowVector && vec.getNumCols() == target.getNumCols()) || (!rowVector && vec.getNumRows() == target.getNumRows())); + const int64 dataInc = ((rowVector && !target.isTrans()) || (!rowVector && target.isTrans())) ? 1 : (rowVector ? target.getNumRows() : target.getNumCols()); + const int64 myStride = ((!target.isTrans() && !rowVector) || (target.isTrans() && rowVector)) ? 1 : vec.getNumElements(); + const int64 numScaling = rowVector ? 
target.getNumRows() : target.getNumCols(); + for (int64 i = 0; i < vec.getNumElements(); i++) { + CBLAS_SCAL(numScaling, vec._data[i], target._data + dataInc * i, myStride); + } +} + +/* return := scale * this * b */ +void Matrix::rightMult(const Matrix& b, MTYPE scale) { + rightMult(b, scale, *this); +} + +/* return := this * b */ +void Matrix::rightMult(const Matrix& b) { + rightMult(b, 1); +} + +/* target := this * b + * also resizes target if necessary.*/ +void Matrix::rightMult(const Matrix &b, Matrix &target) const { + rightMult(b, 1, target); +} + +/* target := scaleAB * this * b + * also resizes target if necessary.*/ +void Matrix::rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const { + if(&target != this) { + target.resize(this->_numRows, b._numCols); + } + target.addProduct(*this, b, scaleAB, 0); +} + +/* this := scaleAB * a*b + scaleC * this + * ALL SIZES MUST BE CORRECT. */ +void Matrix::addProduct(const Matrix& a, const Matrix& b, MTYPE scaleAB, MTYPE scaleThis) { + assert(a.getNumCols() == b.getNumRows()); + assert(this->getNumRows() == a.getNumRows() && this->getNumCols() == b.getNumCols()); + assert(!isTrans()); + CBLAS_GEMM(CblasRowMajor, a._trans, b._trans, a._numRows, b._numCols, a._numCols, scaleAB, a._data, + a._getNumColsBackEnd(), b._data, b._getNumColsBackEnd(), scaleThis, this->_data, this->_numCols); +} + +void Matrix::addProduct(const Matrix& a, const Matrix& b) { + addProduct(a, b, 1, 1); +} + +Matrix& Matrix::transpose() const { + return *new Matrix(this->_data, this->_numCols, this->_numRows, !isTrans()); +} + +Matrix& Matrix::transpose(bool hard) const { + if (!hard || isTrans()) { + return transpose(); + } + Matrix &meTrans = *new Matrix(_numCols, _numRows); + for (int64 i = 0; i < _numRows; i++) { + for (int64 j = 0; j < _numCols; j++) { + meTrans(j, i) = (*this)(i, j); + } + } + return meTrans; +} + +Matrix& Matrix::tile(int64 timesY, int64 timesX) const { + Matrix& tiled = *new Matrix(this->_numRows * timesY, this->_numCols * timesX); + _tileTo2(tiled); + return tiled; +} + +/* resizes target if necessary */ +void Matrix::tile(int64 timesY, int64 timesX, Matrix& target) const { + target.resize(this->_numRows * timesY, this->_numCols * timesX); + _tileTo2(target); +} + +/* a variant ... seems to be no faster than original. */ +void Matrix::_tileTo2(Matrix& target) const { + for(int64 y = 0; y < target._numRows; y += this->_numRows) { + for(int64 x = 0; x < target._numCols; x += this->_numCols) { + this->copy(target, 0, -1, 0, -1, y, x); + } + } +} + +/* guarantees that result will be non-transposed */ +void Matrix::resize(int64 newNumRows, int64 newNumCols) { + if(this->_numRows != newNumRows || this->_numCols != newNumCols) { + assert(!isView()); + if (this->getNumElements() != newNumRows * newNumCols) { + delete[] this->_data; //deleting NULL is ok, sez c++ + this->_data = new MTYPE[newNumRows * newNumCols]; + } + this->_updateDims(newNumRows, newNumCols); + this->_trans = CblasNoTrans; + } +} + +void Matrix::resize(const Matrix& like) { + resize(like.getNumRows(), like.getNumCols()); +} + +void Matrix::scale(MTYPE alpha) { + scale(alpha, *this); +} + +void Matrix::scale(MTYPE alpha, Matrix& target) { + if (&target != this) { + target.resize(*this); + copy(target); + } + CBLAS_SCAL(getNumElements(), alpha, target._data, 1); +} + +/* performs no resizing. + * Warnings: + * 1. ALL DIMENSIONS MUST BE CORRECT + * 2. The source and destination memories better not overlap! 
*/ +void Matrix::copy(Matrix& dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const { + srcEndRow = srcEndRow < 0 ? this->_numRows : srcEndRow; + srcEndCol = srcEndCol < 0 ? this->_numCols : srcEndCol; + assert(destStartRow >= 0 && destStartCol >= 0); //some range-checking + assert(srcEndRow <= _numRows && srcEndCol <= _numCols); + assert(destStartRow + srcEndRow - srcStartRow <= dest.getNumRows()); + assert(destStartCol + srcEndCol - srcStartCol <= dest.getNumCols()); + // I found no evidence that memcpy is actually faster than just + // copying element-by-element. + if (!isTrans() && !dest.isTrans()) { + int64 src_start_idx = this->_numCols * srcStartRow + srcStartCol; + int64 dest_start_idx = dest._numCols * destStartRow + destStartCol; + int64 copy_row_width = srcEndCol - srcStartCol; + + for (int64 i = srcStartRow; i < srcEndRow; i++) { + memcpy(dest._data + dest_start_idx + dest._numCols * (i - srcStartRow), + this->_data + src_start_idx + this->_numCols * (i - srcStartRow), sizeof(MTYPE) * copy_row_width); + } + } else { + for (int64 i = srcStartRow; i < srcEndRow; i++) { + for (int64 j = srcStartCol; j < srcEndCol; j++) { + dest(i - srcStartRow + destStartRow, j - srcStartCol + destStartCol) = (*this)(i, j); + } + } + } +} + +/* preserves everything excluding transposedness. + * new matrix owns its data */ +Matrix& Matrix::copy() const { + Matrix& copy = *new Matrix(*this); + this->copy(copy); + return copy; +} + +/* resizes target if necessary */ +void Matrix::copy(Matrix& target) const { + target.resize(this->_numRows, this->_numCols); //target is now non-transposed + if(this->isTrans() == target.isTrans()) { + this->_copyAllTo(target); + } else { //if I'm transposed, make sure that target is non-transposed copy + this->copy(target, 0, -1, 0, -1, 0, 0); + } +} + +void Matrix::_copyAllTo(Matrix& target) const { + assert(target.isTrans() == isTrans()); + memcpy((void*) target._data, (void*) this->_data, this->getNumDataBytes()); + target._trans = this->_trans; +} + +MTYPE Matrix::min() const { + return _aggregate(&_min, MTYPE_MAX); +} + +Matrix& Matrix::min(int64 axis) const { + Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->min(axis, target); + return target; +} + +void Matrix::min(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_min, MTYPE_MAX); +} + +MTYPE Matrix::max() const { + return _aggregate(&_max, -MTYPE_MAX); +} + +Matrix& Matrix::max(int64 axis) const { + Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->max(axis, target); + return target; +} + +void Matrix::max(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_max, -MTYPE_MAX); +} + +MTYPE Matrix::sum() const { + return _aggregate(&_add, 0); +} + +MTYPE Matrix::norm() const { + return sqrt(norm2()); +} + +MTYPE Matrix::norm2() const { + return _aggregate(&_addSquare, 0); +} + +Matrix& Matrix::sum(int64 axis) const { + Matrix& target = axis == 0 ? 
*new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->sum(axis, target); + return target; +} + +void Matrix::sum(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_add, 0); +} + +void Matrix::_aggregate(int64 axis, Matrix& target, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + if (axis == 0) { + target.resize(1, this->_numCols); + for (int64 j = 0; j < this->_numCols; j++) { + target(0, j) = _aggregateCol(j, agg_func, initialValue); + } + } else { + target.resize(this->_numRows, 1); + for (int64 i = 0; i < this->_numRows; i++) { + target(i, 0) = _aggregateRow(i, agg_func, initialValue); + } + } +} + +MTYPE Matrix::_aggregateRow(int64 row, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + for (int64 j = 0; j < this->_numCols; j++) { + v = agg_func((*this)(row, j), v); + } + return v; +} + +MTYPE Matrix::_aggregateCol(int64 col, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + for (int64 i = 0; i < this->_numRows; i++) { + v = agg_func((*this)(i, col), v); + } + return v; +} + +MTYPE Matrix::_aggregate(MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + MTYPE* ptr = _data; + for (int64 i = 0; i < getNumElements(); i++, ptr++) { + v = agg_func(*ptr, v); + } + return v; +} + +void Matrix::printShape(const char* name) const { + printf("%s: %lldx%lld\n", name, getNumRows(), getNumCols()); +} + +void Matrix::print() const { + print(0,getNumRows(),0, getNumCols()); +} + +void Matrix::print(int64 rows, int64 cols) const { + print(0,rows,0, cols); +} + +void Matrix::print(int64 startRow, int64 rows, int64 startCol, int64 cols) const { + for (int64 i = startRow; i < std::min(startRow+rows, this->_numRows); i++) { + for (int64 j = startCol; j < std::min(startCol+cols, this->_numCols); j++) { + printf("%.15f ", (*this)(i, j)); + } + printf("\n"); + } +} + +void Matrix::apply(Matrix::FUNCTION f) { + apply(f, *this); +} + + +void Matrix::apply(Matrix::FUNCTION f, Matrix& target) { + MTYPE (*func)(MTYPE); + if(f == EXP) { + func = &_exp; + } else if(f == TANH) { + func = &_tanh; + } else if(f == RECIPROCAL) { + func = &_recip; + } else if (f == SQUARE) { + func = &_square; + } else if(f == LOG) { + func = &_log; + } else if(f == ZERO) { + func = &_zero; + } else if (f == ONE) { + func = &_one; + } else if(f == LOGISTIC1) { + func = &_sigma1; + } else if(f == LOGISTIC2) { + func = &_sigma2; + } else if (f == ABS) { + func = &_abs; + } else if (f == SIGN) { + func = &_sign; + } else { + return; + //LOG(FATAL) << "Matrix::apply: Unknown function type"; + } + this->_applyLoop(func, target); +} + +void Matrix::eltWiseMult(const Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + this->_applyLoop2(a, &_mult, target); +} + +void Matrix::eltWiseDivide(const Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + this->_applyLoop2(a, &_divide, target); +} + +void Matrix::eltWiseMult(const Matrix& a) { + eltWiseMult(a, *this); +} + +void Matrix::eltWiseDivide(const Matrix& a) { + eltWiseDivide(a, *this); +} + +void Matrix::randomizeUniform() { + this->_applyLoop(&_rand); +} + +void Matrix::randomizeNormal() { + //LOG(FATAL) << "randomizeNormal only implemented on MKL!"; +} + +void Matrix::randomizeNormal(MTYPE /*mean*/, MTYPE /*stdev*/) { + // LOG(FATAL) << "randomizeNormal only implemented on MKL!"; +} + +void Matrix::eltWiseDivideByVector(const Matrix& vec) { + eltWiseDivideByVector(vec, 
*this); +} + +/* This function allocates a chunk of memory at most as big as the input vector */ +void Matrix::eltWiseDivideByVector(const Matrix& vec, Matrix& target) { + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + assert((rowVector && vec.getNumCols() == getNumCols()) || (!rowVector && vec.getNumRows() == getNumRows())); + if(&target != this) { + target.resize(*this); + } + _divideByVector(vec, target); +} + +void Matrix::_divideByVector(const Matrix& vec, Matrix& target) { + Matrix& vecInverse = vec.copy(); + vecInverse.apply(RECIPROCAL); + eltWiseMultByVector(vecInverse,target); + delete &vecInverse; +} + +void Matrix::reshape(int64 numRows, int64 numCols) { + assert(_numElements == numRows*numCols); + _numRows = numRows; + _numCols = numCols; +} + +Matrix& Matrix::reshaped(int64 numRows, int64 numCols) { + assert(_numElements == numRows*numCols); + return *new Matrix(_data, numRows, numCols, isTrans()); +} + +void Matrix::_applyLoop(MTYPE (*func)(MTYPE), Matrix& target) { + MTYPE *ptr = this->_data, *tgtPtr = target._data; + for (int64 i = 0; i < getNumElements(); i++, ptr++, tgtPtr++) { + *tgtPtr = (*func)(*ptr); + } +} + +void Matrix::_applyLoop(MTYPE (*func)(MTYPE)) { + _applyLoop(func, *this); +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE), Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j)); + } + } +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j), scalar); + } + } +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j), scalar1, scalar2); + } + } +} + +void Matrix::_applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const { + MTYPE *myPtr = _data; + MTYPE *targetPtr = target._data; + for (int64 i = 0; i < getNumElements(); i++, myPtr++, targetPtr++) { + *targetPtr = (*func)(*myPtr, scalar); + } +} + +bool Matrix::hasNan() const { + for (int64 r = 0; r < _numRows; r++) { + for (int64 c = 0; c < _numCols; c++) { + if (isnan((*this)(r,c))) { + return true; + } + } + } + return false; +} + +bool Matrix::hasInf() const { + for (int64 r = 0; r < _numRows; r++) { + for (int64 c = 0; c < _numCols; c++) { + if (isinf((*this)(r,c))) { + return true; + } + } + } + return false; +} + + diff --git a/caffe2/contrib/docker-ubuntu-14.04/Dockerfile b/caffe2/contrib/docker-ubuntu-14.04/Dockerfile new file mode 100644 index 0000000..c8d2bcc --- /dev/null +++ b/caffe2/contrib/docker-ubuntu-14.04/Dockerfile @@ -0,0 +1,126 @@ +FROM ubuntu:14.04 +MAINTAINER caffe-dev + +# A docker container with CUDA and caffe2 installed. +# Note: this should install everything but cudnn, which requires you to have a +# manual registration and download from the NVidia website. After creating this +# docker image, the Caffe2 repository is located at /opt/caffe2. You can install +# cudnn manually and re-compile caffe2. + +################################################################################ +# Step 1: set up cuda on the ubuntu box. 
+################################################################################ + +RUN apt-get update && apt-get install -q -y \ + build-essential \ + wget + +RUN cd /tmp && \ + wget http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run && \ + chmod +x cuda_*_linux.run && ./cuda_*_linux.run -extract=`pwd` && \ + ./NVIDIA-Linux-x86_64-*.run -s --no-kernel-module && \ + ./cuda-linux64-rel-*.run -noprompt && \ + rm -rf * + +# Ensure the CUDA libs and binaries are in the correct environment variables +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 +ENV PATH=$PATH:/usr/local/cuda/bin + +# Run nvcc to make sure things are set correctly. +RUN nvcc --version + +################################################################################ +# Step 2: set up caffe2 pre-requisites +################################################################################ + +RUN apt-get update && apt-get install -q -y \ + git \ + libeigen3-dev \ + libgoogle-glog-dev \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libprotobuf-dev \ + libsnappy-dev \ + zlib1g-dev \ + libbz2-dev \ + protobuf-compiler \ + python-dev \ + python-pip + +RUN cd /tmp && \ + git clone https://github.com/facebook/rocksdb.git && \ + cd /tmp/rocksdb && \ + make && make install && \ + cd / && \ + rm -rf /tmp/rocksdb + +# Caffe2 works best with openmpi 1.8.5 or above (which has cuda support). +# If you do not need openmpi, skip this step. +RUN cd /tmp && \ + wget http://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-1.10.0.tar.gz && \ + tar xzvf openmpi-1.10.0.tar.gz && \ + cd /tmp/openmpi-1.10.0 && \ + ./configure --with-cuda --with-threads && \ + make && make install && \ + cd / && \ + rm -rf /tmp/openmpi-1.10.0 && \ + rm /tmp/openmpi-1.10.0.tar.gz + +# Caffe2 requires zeromq 4.0 or above, manually install. +# If you do not need zeromq, skip this step. +RUN apt-get install -q -y autoconf libtool +RUN mkdir /tmp/zeromq-build && \ + cd /tmp/zeromq-build && \ + wget https://github.com/zeromq/zeromq4-1/archive/v4.1.3.tar.gz && \ + tar xzvf v4.1.3.tar.gz --strip 1 && \ + ./autogen.sh && \ + ./configure --without-libsodium && \ + make && make install && \ + cd / && \ + rm -rf /tmp/zeromq-build + +# pip self upgrade +RUN pip install --upgrade pip + +# Python dependencies +RUN pip install \ + matplotlib \ + numpy \ + protobuf + +################################################################################ +# Step 3: install optional dependencies ("good to have" features) +################################################################################ + +RUN apt-get install -q -y \ + gfortran \ + graphviz \ + libatlas-base-dev \ + vim + +RUN pip install \ + flask \ + ipython \ + notebook \ + pydot \ + python-nvd3 \ + scipy \ + tornado + +# This is intentional. scikit-image has to be after scipy. +RUN pip install \ + scikit-image + +################################################################################ +# Step 4: set up caffe2 +################################################################################ + +# Get the repository, and build. +RUN cd /opt && \ + git clone https://github.com/Yangqing/caffe2.git && \ + cd /opt/caffe2 && \ + make + +# Now, we know that some of the caffe tests will fail. How do we deal with +# those? 
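+
+# Example usage (illustrative only; the image tag and device paths below are
+# assumptions, not something defined by this Dockerfile): build the image and
+# start a container with the NVIDIA device nodes exposed, e.g.
+#
+#   docker build -t caffe2-ubuntu14.04 .
+#   docker run --rm -it \
+#     --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 \
+#     caffe2-ubuntu14.04 /bin/bash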
diff --git a/caffe2/contrib/gloo/CMakeLists.txt b/caffe2/contrib/gloo/CMakeLists.txt
new file mode 100644
index 0000000..ff77e32
--- /dev/null
+++ b/caffe2/contrib/gloo/CMakeLists.txt
@@ -0,0 +1,22 @@
+if(USE_GLOO)
+  set(Caffe2_CONTRIB_GLOO_CPU_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/allgather_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/barrier_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
+  )
+
+  set(Caffe2_CONTRIB_GLOO_GPU_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
+  )
+
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
+  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
+endif()
diff --git a/caffe2/contrib/gloo/allgather_ops.cc b/caffe2/contrib/gloo/allgather_ops.cc
new file mode 100644
index 0000000..ff536bd
--- /dev/null
+++ b/caffe2/contrib/gloo/allgather_ops.cc
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "allgather_ops.h"
+
+#include <gloo/allgather_ring.h>
+
+// NOTE: the template arguments and <gloo/...>/<std> header names in these
+// gloo operator files were lost from the extracted patch; the specific types
+// and includes restored below are a best-effort reconstruction, not verbatim.
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+void AllgatherOp<Context>::initializeAlgorithm() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<float>(
+        init_.context,
+        init_.template getInputs<float>(),
+        init_.template getOutput<float>(),
+        init_.size));
+  } else if (init_.template IsType<long>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<long>(
+        init_.context,
+        init_.template getInputs<long>(),
+        init_.template getOutput<long>(),
+        init_.size));
+  } else if (init_.template IsType<int>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<int>(
+        init_.context,
+        init_.template getInputs<int>(),
+        init_.template getOutput<int>(),
+        init_.size));
+  } else if (init_.template IsType<float16>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<::gloo::float16>(
+        init_.context,
+        init_.template getInputs<::gloo::float16>(),
+        init_.template getOutput<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+namespace {
+
+REGISTER_CPU_OPERATOR_WITH_ENGINE(Allgather, GLOO, AllgatherOp<CPUContext>);
+
+} // namespace
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h
new file mode 100644
index 0000000..044357c
--- /dev/null
+++ b/caffe2/contrib/gloo/allgather_ops.h
@@ -0,0 +1,130 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <mutex>
+
+#include "caffe2/contrib/gloo/common.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/types.h"
+
+#include <gloo/algorithm.h>
+#include <gloo/common/error.h>
+#include <gloo/context.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+class AllgatherOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  AllgatherOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        ws_(ws),
+        status_blob_(
+            OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
+    if (status_blob_ != "") {
+      ws_->CreateBlob(status_blob_);
+    }
+  }
+
+  virtual ~AllgatherOp() {}
+
+  bool RunOnDevice() override {
+    std::call_once(once_, [&] { initialize(); });
+
+    // If any parameter has changed in between runs, the initialized
+    // algorithm is invalid and cannot be used.
+    update(current_);
+    CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
+
+    try {
+      algorithm_->run();
+    } catch (::gloo::IoException& ioe) {
+      LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
+      if (status_blob_ != "") {
+        signalFailure(ws_->GetBlob(status_blob_), ioe);
+        return false;
+      } else {
+        throw;
+      }
+    }
+    return true;
+  }
+
+ protected:
+  void initialize() {
+    // Allocate output tensor
+    CAFFE_ENFORCE_EQ(OutputSize(), 1);
+    auto comm_size =
+        OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
+    const auto dims =
+        std::vector<TIndex>(1, (InputSize() - 1) * Input(1).size() * comm_size);
+    Output(0)->Resize(dims);
+
+    // Store which inputs/outputs this instance initialized with
+    update(init_);
+
+    CAFFE_ENFORCE_EQ(init_.outputs.size(), 1);
+
+    // Verify tensors all have same size
+    size_t size = Input(1).size();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE_EQ(Input(i).size(), size);
+    }
+
+    // Verify tensors all have same type
+    TypeMeta meta = Input(1).meta();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE(Input(i).meta() == meta);
+    }
+
+    // Finally initialize the algorithm
+    initializeAlgorithm();
+  }
+
+  void initializeAlgorithm();
+
+  std::once_flag once_;
+  std::unique_ptr<::gloo::Algorithm> algorithm_;
+
+  // Captures the parameters passed to Gloo when first initialized.
+  // An instance is updated every time this op runs and is compared
+  // to the reference instance for equality. If any parameter has
+  // changed from run to run, the initialized algorithm is invalid.
+  void update(GlooParameters& params) {
+    params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
+    params.inputs.resize(InputSize() - 1);
+    params.size = Input(1).size();
+    params.meta = Input(1).meta();
+    for (auto i = 0; i < params.inputs.size(); i++) {
+      params.inputs[i] = Input(i + 1).template raw_data();
+    }
+    params.outputs.resize(OutputSize());
+    params.outputs[0] = Output(0)->raw_mutable_data(params.meta);
+  }
+
+  GlooParameters init_;
+  GlooParameters current_;
+  Workspace* ws_;
+  std::string status_blob_;
+};
+
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops.cc b/caffe2/contrib/gloo/allreduce_ops.cc
new file mode 100644
index 0000000..888e34a
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops.cc
@@ -0,0 +1,62 @@
+#include "allreduce_ops.h"
+
+#include <gloo/allreduce_halving_doubling.h>
+#include <gloo/allreduce_ring.h>
+#include <gloo/allreduce_ring_chunked.h>
+#include <gloo/types.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+void AllreduceOp<Context>::initializeHalvingDoubling() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+template <class Context>
+void AllreduceOp<Context>::initializeRingFull() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceRing<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceRing<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+template <class Context>
+void AllreduceOp<Context>::initializeRingChunked() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceRingChunked<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceRingChunked<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+namespace {
+
+REGISTER_CPU_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CPUContext>);
+
+} // namespace
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h
new file mode 100644
index 0000000..8837b32
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops.h
@@ -0,0 +1,131 @@
+#pragma once
+
+#include <mutex>
+
+#include "caffe2/contrib/gloo/common.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+#include <gloo/algorithm.h>
+#include <gloo/common/error.h>
+#include <gloo/context.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+class AllreduceOp final : public Operator<Context> {
+  enum Mode { RING_FULL, RING_CHUNKED, HALVING_DOUBLING };
+
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  AllreduceOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        ws_(ws),
+        status_blob_(
+            OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
+        gpu_direct_(
+            OperatorBase::GetSingleArgument<bool>("gpu_direct", false)) {
+    if (status_blob_ != "") {
+      ws_->CreateBlob(status_blob_);
+    }
+  }
+
+  virtual ~AllreduceOp() {}
+
+  bool RunOnDevice() override {
+    std::call_once(once_, [&] { initialize(); });
+
+    // If any parameter has changed in between runs, the initialized
+    // algorithm is invalid and cannot be used.
+    update(current_);
+    CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
+
+    try {
+      algorithm_->run();
+    } catch (::gloo::IoException& ioe) {
+      LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
+      if (status_blob_ != "") {
+        signalFailure(ws_->GetBlob(status_blob_), ioe);
+        return false;
+      } else {
+        throw;
+      }
+    }
+    return true;
+  }
+
+ protected:
+  void initialize() {
+    Mode mode = HALVING_DOUBLING;
+    auto bytes = Input(1).nbytes();
+
+    // Store which inputs/outputs this instance initialized with
+    update(init_);
+
+    // Verify inputs == outputs
+    CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
+    for (auto i = 0; i < init_.inputs.size(); i++) {
+      CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
+    }
+
+    // Verify tensors all have same size
+    size_t size = Input(1).size();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE_EQ(Input(i).size(), size);
+    }
+
+    // Verify tensors all have same type
+    TypeMeta meta = Input(1).meta();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE(Input(i).meta() == meta);
+    }
+
+    switch (mode) {
+      case RING_FULL:
+        initializeRingFull();
+        return;
+      case RING_CHUNKED:
+        initializeRingChunked();
+        return;
+      case HALVING_DOUBLING:
+        initializeHalvingDoubling();
+        return;
+    }
+
+    CAFFE_ENFORCE(false, "Unreachable code");
+  }
+
+  void initializeHalvingDoubling();
+  void initializeRingFull();
+  void initializeRingChunked();
+
+  std::once_flag once_;
+  std::unique_ptr<::gloo::Algorithm> algorithm_;
+
+  // Captures the parameters passed to Gloo when first initialized.
+  // An instance is updated every time this op runs and is compared
+  // to the reference instance for equality. If any parameter has
+  // changed from run to run, the initialized algorithm is invalid.
+  void update(GlooParameters& params) {
+    params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
+    params.inputs.resize(InputSize() - 1);
+    params.outputs.resize(OutputSize());
+    for (auto i = 0; i < params.inputs.size(); i++) {
+      params.inputs[i] = Input(i + 1).template raw_data();
+      params.outputs[i] = Output(i)->template raw_mutable_data();
+    }
+    params.size = Output(0)->size();
+    params.meta = Output(0)->meta();
+  }
+
+  GlooParameters init_;
+  GlooParameters current_;
+  Workspace* ws_;
+  std::string status_blob_;
+  const bool gpu_direct_;
+};
+
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops_gpu.cc b/caffe2/contrib/gloo/allreduce_ops_gpu.cc
new file mode 100644
index 0000000..bbc187c
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops_gpu.cc
@@ -0,0 +1,109 @@
+#include "allreduce_ops.h"
+
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/core/logging.h"
+
+#include <gloo/cuda_allreduce_halving_doubling.h>
+#include <gloo/cuda_allreduce_halving_doubling_pipelined.h>
+#include <gloo/cuda_allreduce_ring.h>
+#include <gloo/cuda_allreduce_ring_chunked.h>
+
+namespace caffe2 {
+namespace gloo {
+
+namespace {
+
+// Decides on using GPUDirect based on device support.
+template